Merge branch 'rolling' into feat-rate-limiter-for-websurfx

2023-09-11 11:40:10 +05:30 · 2023-09-11 11:40:10 +05:30 · 2790eefba8
commit 2790eefba8
parent 86991a2f9a 867753a135
14 changed files with 253 additions and 74 deletions
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@ -43,6 +43,7 @@ impl SearchEngine for DuckDuckGo {
        page: u32,
        user_agent: &str,
        request_timeout: u8,
+        _safe_search: u8,
    ) -> Result<HashMap<String, SearchResult>, EngineError> {
        // Page number can be missing or empty string and so appropriate handling is required
        // so that upstream server receives valid page number.
--- a/src/engines/engine_models.rs
+++ b/src/engines/engine_models.rs
@ -71,6 +71,7 @@ pub trait SearchEngine: Sync + Send {
        page: u32,
        user_agent: &str,
        request_timeout: u8,
+        safe_search: u8,
    ) -> Result<HashMap<String, SearchResult>, EngineError>;
 }

--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@ -42,12 +42,21 @@ impl SearchEngine for Searx {
        page: u32,
        user_agent: &str,
        request_timeout: u8,
+        mut safe_search: u8,
    ) -> Result<HashMap<String, SearchResult>, EngineError> {
        // Page number can be missing or empty string and so appropriate handling is required
        // so that upstream server receives valid page number.
+        if safe_search == 3 {
+            safe_search = 2;
+        };
+
        let url: String = match page {
-            0 | 1 => format!("https://searx.work/search?q={query}&pageno=1"),
-            _ => format!("https://searx.work/search?q={query}&pageno={page}"),
+            0 | 1 => {
+                format!("https://searx.work/search?q={query}&pageno=1&safesearch={safe_search}")
+            }
+            _ => format!(
+                "https://searx.work/search?q={query}&pageno={page}&safesearch={safe_search}"
+            ),
        };

        // initializing headers and adding appropriate headers.
--- a/src/results/aggregation_models.rs
+++ b/src/results/aggregation_models.rs
@ -102,13 +102,15 @@ impl EngineErrorInfo {
 /// and the type of error that caused it.
 /// * `empty_result_set` - Stores a boolean which indicates that no engines gave a result for the
 /// given search query.
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Default)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResults {
    pub results: Vec<SearchResult>,
    pub page_query: String,
    pub style: Style,
-    pub engine_errors_info: SmallVec<[EngineErrorInfo; 0]>,
+    pub engine_errors_info: Vec<EngineErrorInfo>,
+    pub disallowed: bool,
+    pub filtered: bool,
 }

 impl SearchResults {
@ -122,6 +124,7 @@ impl SearchResults {
    /// the search url.
    /// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
    /// given search query.
+    /// * ``
    pub fn new(
        results: Vec<SearchResult>,
        page_query: &str,
@ -131,12 +134,38 @@ impl SearchResults {
            results,
            page_query: page_query.to_owned(),
            style: Style::default(),
-            engine_errors_info: SmallVec::from(engine_errors_info),
+            engine_errors_info: engine_errors_info.to_owned(),
+            disallowed: Default::default(),
+            filtered: Default::default(),
        }
    }

    /// A setter function to add website style to the return search results.
    pub fn add_style(&mut self, style: &Style) {
-        self.style = style.to_owned();
+        self.style = style.clone();
+    }
+
+    /// Marks these search results as disallowed by setting the `disallowed` field to `true`.
+    pub fn set_disallowed(&mut self) {
+        self.disallowed = true;
+    }
+
+    /// Replaces the stored `page_query` with an owned copy of the given `page` string.
+    pub fn set_page_query(&mut self, page: &str) {
+        self.page_query = page.to_owned();
+    }
+
+    /// Marks these search results as filtered by setting the `filtered` field to `true`.
+    pub fn set_filtered(&mut self) {
+        self.filtered = true;
+    }
+
+    /// Moves `engine_errors_info` out of `self` and returns it, leaving an empty `Vec` behind.
+    pub fn engine_errors_info(&mut self) -> Vec<EngineErrorInfo> {
+        std::mem::take(&mut self.engine_errors_info)
+    }
+    /// Returns a clone of the `results` list; the list stored in `self` is left untouched.
+    pub fn results(&mut self) -> Vec<SearchResult> {
+        self.results.clone()
     }
 }
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@ -70,6 +70,7 @@ pub async fn aggregate(
    debug: bool,
    upstream_search_engines: &[EngineHandler],
    request_timeout: u8,
+    safe_search: u8,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
    let user_agent: &str = random_user_agent();

@ -91,7 +92,13 @@ pub async fn aggregate(
        let query: String = query.to_owned();
        tasks.push(tokio::spawn(async move {
            search_engine
-                .results(&query, page, user_agent, request_timeout)
+                .results(
+                    &query,
+                    page,
+                    user_agent.clone(),
+                    request_timeout,
+                    safe_search,
+                )
                .await
        }));
    }
@ -150,20 +157,22 @@ pub async fn aggregate(
        }
    }

-    let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
-    filter_with_lists(
-        &mut result_map,
-        &mut blacklist_map,
-        file_path(FileType::BlockList)?,
-    )?;
+    if safe_search >= 3 {
+        let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
+        filter_with_lists(
+            &mut result_map,
+            &mut blacklist_map,
+            file_path(FileType::BlockList)?,
+        )?;

-    filter_with_lists(
-        &mut blacklist_map,
-        &mut result_map,
-        file_path(FileType::AllowList)?,
-    )?;
+        filter_with_lists(
+            &mut blacklist_map,
+            &mut result_map,
+            file_path(FileType::AllowList)?,
+        )?;

-    drop(blacklist_map);
+        drop(blacklist_map);
+    }

    let results: Vec<SearchResult> = result_map.into_values().collect();

@ -189,7 +198,7 @@ pub fn filter_with_lists(
    let mut reader = BufReader::new(File::open(file_path)?);

    for line in reader.by_ref().lines() {
-        let re = Regex::new(&line?)?;
+        let re = Regex::new(line?.trim())?;

        // Iterate over each search result in the map and check if it matches the regex pattern
        for (url, search_result) in map_to_be_filtered.clone().into_iter() {
--- a/src/server/routes.rs
+++ b/src/server/routes.rs
@ -2,7 +2,10 @@
 //! meta search engine website and provide appropriate response to each route/page
 //! when requested.

-use std::fs::read_to_string;
+use std::{
+    fs::{read_to_string, File},
+    io::{BufRead, BufReader, Read},
+};

 use crate::{
    cache::cacher::RedisCache,
@ -13,12 +16,13 @@ use crate::{
 };
 use actix_web::{get, web, HttpRequest, HttpResponse};
 use handlebars::Handlebars;
+use regex::Regex;
 use serde::Deserialize;
 use tokio::join;

 // ---- Constants ----
 /// Initialize redis cache connection once and store it on the heap.
-const REDIS_CACHE: async_once_cell::OnceCell<RedisCache> = async_once_cell::OnceCell::new();
+static REDIS_CACHE: async_once_cell::OnceCell<RedisCache> = async_once_cell::OnceCell::new();

 /// A named struct which deserializes all the user provided search parameters and stores them.
 ///
@ -32,6 +36,7 @@ const REDIS_CACHE: async_once_cell::OnceCell<RedisCache> = async_once_cell::Once
 struct SearchParams {
    q: Option<String>,
    page: Option<u32>,
+    safesearch: Option<u8>,
 }

 /// Handles the route of index page or main page of the `websurfx` meta search engine website.
@ -105,42 +110,58 @@ pub async fn search(
                None => 1,
            };

+            let safe_search: u8 = match config.safe_search {
+                3..=4 => config.safe_search,
+                _ => match &params.safesearch {
+                    Some(safesearch) => match safesearch {
+                        0..=2 => *safesearch,
+                        _ => 1,
+                    },
+                    None => config.safe_search,
+                },
+            };
+
            let (_, results, _) = join!(
                results(
                    format!(
-                        "http://{}:{}/search?q={}&page={}",
+                        "http://{}:{}/search?q={}&page={}&safesearch={}",
                        config.binding_ip,
                        config.port,
                        query,
-                        page - 1
+                        page - 1,
+                        safe_search
                    ),
                    &config,
                    query,
                    page - 1,
-                    &req,
+                    req.clone(),
+                    safe_search
                ),
                results(
                    format!(
-                        "http://{}:{}/search?q={}&page={}",
-                        config.binding_ip, config.port, query, page
+                        "http://{}:{}/search?q={}&page={}&safesearch={}",
+                        config.binding_ip, config.port, query, page, safe_search
                    ),
                    &config,
                    query,
                    page,
-                    &req,
+                    req.clone(),
+                    safe_search
                ),
                results(
                    format!(
-                        "http://{}:{}/search?q={}&page={}",
+                        "http://{}:{}/search?q={}&page={}&safesearch={}",
                        config.binding_ip,
                        config.port,
                        query,
-                        page + 1
+                        page + 1,
+                        safe_search
                    ),
                    &config,
                    query,
                    page + 1,
-                    &req,
+                    req.clone(),
+                    safe_search
                )
            );

@ -160,9 +181,10 @@ async fn results(
    config: &Config,
    query: &str,
    page: u32,
-    req: &HttpRequest,
+    req: HttpRequest,
+    safe_search: u8,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
-    let redis_cache: RedisCache = REDIS_CACHE
+    let mut redis_cache: RedisCache = REDIS_CACHE
        .get_or_init(async {
            // Initialize redis cache connection pool only once and store it in the heap.
            RedisCache::new(&config.redis_url, 5).await.unwrap()
@ -178,6 +200,23 @@ async fn results(
    match cached_results_json {
        Ok(results) => Ok(serde_json::from_str::<SearchResults>(&results)?),
        Err(_) => {
+            // At safe search level 4 the query itself is screened against the filter
+            // lists before any upstream search engine is contacted.
+            if safe_search == 4 {
+                // The query is disallowed only when it matches the block list AND is
+                // not rescued by the allow list. Both results must be combined: letting
+                // the allow list check overwrite the block list check would disallow
+                // every query that merely fails to appear on the allow list.
+                let disallowed: bool =
+                    is_match_from_filter_list(file_path(FileType::BlockList)?, query)?
+                        && !is_match_from_filter_list(file_path(FileType::AllowList)?, query)?;
+
+                if disallowed {
+                    // Build an empty, styled result set flagged as disallowed, cache it
+                    // under the request url and return early.
+                    let mut results: SearchResults = SearchResults::default();
+                    results.set_disallowed();
+                    results.add_style(&config.style);
+                    results.set_page_query(query);
+                    redis_cache
+                        .cache_results(&serde_json::to_string(&results)?, &url)
+                        .await?;
+                    return Ok(results);
+                }
+            }
+
            // check if the cookie value is empty or not if it is empty then use the
            // default selected upstream search engines from the config file otherwise
            // parse the non-empty cookie and grab the user selected engines from the
@ -199,6 +238,7 @@ async fn results(
                        config.debug,
                        &engines,
                        config.request_timeout,
+                        safe_search,
                    )
                    .await?
                }
@ -210,14 +250,16 @@ async fn results(
                        config.debug,
                        &config.upstream_search_engines,
                        config.request_timeout,
+                        safe_search,
                    )
                    .await?
                }
            };
-
+            // Inspect the public fields directly: the `engine_errors_info()` accessor
+            // moves the errors out of `results` (and `results()` clones the whole list),
+            // which would strip the engine errors from what is cached and returned below.
+            if results.engine_errors_info.is_empty() && results.results.is_empty() {
+                results.set_filtered();
+            }
            results.add_style(&config.style);
            redis_cache
-                .clone()
                .cache_results(&serde_json::to_string(&results)?, &url)
                .await?;
            Ok(results)
@ -225,6 +267,22 @@ async fn results(
    }
 }

+/// Checks whether the search query matches any pattern listed in a filter list file.
+///
+/// Each non-blank line of the file is trimmed and compiled as a regular expression.
+/// The scan stops at the first pattern that matches the query.
+///
+/// # Arguments
+///
+/// * `file_path` - Path of the filter list file, holding one regex pattern per line.
+/// * `query` - The search query to test against the patterns.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened or read, or if a line is not a valid
+/// regular expression.
+fn is_match_from_filter_list(
+    file_path: &str,
+    query: &str,
+) -> Result<bool, Box<dyn std::error::Error>> {
+    let mut reader = BufReader::new(File::open(file_path)?);
+    for line in reader.by_ref().lines() {
+        let line = line?;
+        // Surrounding whitespace (e.g. a trailing `\r`) is not part of the pattern.
+        let pattern = line.trim();
+        // An empty pattern matches every query, so blank lines must not count as entries.
+        if pattern.is_empty() {
+            continue;
+        }
+        if Regex::new(pattern)?.is_match(query) {
+            return Ok(true);
+        }
+    }
+    Ok(false)
+}
+
 /// Handles the route of robots.txt page of the `websurfx` meta search engine website.
 #[get("/robots.txt")]
 pub async fn robots_data(_req: HttpRequest) -> Result<HttpResponse, Box<dyn std::error::Error>> {