Merge branch 'rolling' into feat-error-box-for-engine-errors

zhou fan 2023-08-24 08:02:03 +08:00 committed by GitHub
commit 575a7f95ba
13 changed files with 344 additions and 349 deletions

View file

@@ -15,7 +15,7 @@ use websurfx::{config::parser::Config, run};
#[actix_web::main]
async fn main() -> std::io::Result<()> {
// Initialize the parsed config file.
let config = Config::parse(true).unwrap();
let config = Config::parse(false).unwrap();
log::info!(
"started server on port {} and IP {}",

View file

@@ -34,7 +34,7 @@ pub struct Config {
pub aggregator: AggregatorConfig,
pub logging: bool,
pub debug: bool,
pub upstream_search_engines: Vec<String>,
pub upstream_search_engines: Vec<crate::engines::engine_models::EngineHandler>,
pub request_timeout: u8,
pub threads: u8,
}
@@ -57,7 +57,7 @@ impl Config {
/// # Arguments
///
/// * `logging_initialized` - It takes a boolean which ensures that the logging doesn't get
/// initialized twice.
/// initialized twice. Pass false if the logger has not yet been initialized.
///
/// # Error
///
@@ -77,22 +77,8 @@ impl Config {
let debug: bool = globals.get::<_, bool>("debug")?;
let logging: bool = globals.get::<_, bool>("logging")?;
// Check whether logging has not been initialized before.
if logging_initialized {
if let Ok(pkg_env_var) = std::env::var("PKG_ENV"){
if pkg_env_var.to_lowercase() == "dev" {
env_logger::Builder::new().filter(None, LevelFilter::Trace).init();
}
} else {
// Initializing logging middleware with level set to default or info.
let mut log_level: LevelFilter = LevelFilter::Error;
if logging && debug == false {
log_level = LevelFilter::Info;
} else if debug {
log_level = LevelFilter::Debug;
};
env_logger::Builder::new().filter(None, log_level).init();
}
if !logging_initialized {
set_logging_level(debug, logging);
}
let threads: u8 = if parsed_threads == 0 {
@@ -121,12 +107,14 @@ impl Config {
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
.into_iter()
.filter_map(|(key, value)| value.then_some(key))
.filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
.collect(),
request_timeout: globals.get::<_, u8>("request_timeout")?,
threads,
})
})
}
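The two `filter_map` calls above first drop engines that are disabled in the config and then drop names the engine layer does not recognize. Below is a rough, self-contained sketch of that chain; `parse_engine` is a hypothetical stand-in for `EngineHandler::new`, and the map literal stands in for the deserialized config table (both are illustrative assumptions, not the crate's API).

use std::collections::HashMap;

// Hypothetical stand-in for EngineHandler::new: returns None for names the
// engine layer does not recognize.
fn parse_engine(name: &str) -> Option<String> {
    match name.to_lowercase().as_str() {
        "duckduckgo" | "searx" => Some(name.to_lowercase()),
        _ => None,
    }
}

fn main() {
    // Roughly what the deserialized config table looks like: engine name -> enabled flag.
    let raw: HashMap<String, bool> = HashMap::from([
        ("DuckDuckGo".to_string(), true),
        ("Searx".to_string(), false),
        ("Unknown".to_string(), true),
    ]);

    // The first filter_map keeps only enabled engines, the second drops names
    // that cannot be parsed -- mirroring the chain in Config::parse above.
    let engines: Vec<String> = raw
        .into_iter()
        .filter_map(|(key, value)| value.then_some(key))
        .filter_map(|engine| parse_engine(&engine))
        .collect();

    assert_eq!(engines, vec!["duckduckgo".to_string()]);
}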
/// A helper function which returns an appropriate config file path checking if the config
/// file exists on that path.
///
@@ -173,3 +161,25 @@ impl Config {
Err("Config file not found!!".to_string().into())
}
}
/// A helper function that sets the proper logging level.
fn set_logging_level(debug: bool, logging: bool) {
if let Ok(pkg_env_var) = std::env::var("PKG_ENV") {
if pkg_env_var.to_lowercase() == "dev" {
env_logger::Builder::new()
.filter(None, LevelFilter::Trace)
.init();
return;
}
}
// Initializing logging middleware with level set to default or info.
let log_level = match (debug, logging) {
(true, true) => LevelFilter::Debug,
(true, false) => LevelFilter::Debug,
(false, true) => LevelFilter::Info,
(false, false) => LevelFilter::Error,
};
env_logger::Builder::new().filter(None, log_level).init();
}
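A minimal sketch of the same decision table, assuming the `PKG_ENV=dev` override always wins; `Level` and `choose_level` are illustrative stand-ins for `log::LevelFilter` and `set_logging_level`, kept free of `env_logger` so the mapping can be checked in isolation.

// Stand-in for log::LevelFilter, for illustration only.
#[derive(Debug, PartialEq)]
enum Level {
    Error,
    Info,
    Debug,
    Trace,
}

fn choose_level(pkg_env: Option<&str>, debug: bool, logging: bool) -> Level {
    // The dev override wins regardless of the config flags.
    if matches!(pkg_env, Some(v) if v.eq_ignore_ascii_case("dev")) {
        return Level::Trace;
    }
    // Same arms as set_logging_level: debug takes precedence over logging.
    match (debug, logging) {
        (true, _) => Level::Debug,
        (false, true) => Level::Info,
        (false, false) => Level::Error,
    }
}

fn main() {
    assert_eq!(choose_level(Some("dev"), false, false), Level::Trace);
    assert_eq!(choose_level(None, true, false), Level::Debug);
    assert_eq!(choose_level(None, false, true), Level::Info);
    assert_eq!(choose_level(None, false, false), Level::Error);
}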

View file

@@ -7,7 +7,7 @@ use std::collections::HashMap;
use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use scraper::{Html, Selector};
use crate::results::aggregation_models::RawSearchResult;
use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine};
@@ -43,7 +43,7 @@ impl SearchEngine for DuckDuckGo {
page: u32,
user_agent: String,
request_timeout: u8,
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that upstream server receives valid page number.
let url: String = match page {
@@ -120,7 +120,7 @@ impl SearchEngine for DuckDuckGo {
Ok(document
.select(&results)
.map(|result| {
RawSearchResult::new(
SearchResult::new(
result
.select(&result_title)
.next()
@@ -147,7 +147,7 @@ impl SearchEngine for DuckDuckGo {
vec!["duckduckgo".to_string()],
)
})
.map(|search_result| (search_result.visiting_url.clone(), search_result))
.map(|search_result| (search_result.url.clone(), search_result))
.collect())
}
}
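The final `.map(...)` keys each scraped result by its `url` (previously `visiting_url`) before collecting into a `HashMap`, so repeated links from one engine collapse to a single entry. A small illustrative sketch of that keying behaviour, with `Scraped` as an assumed stand-in for the scraped result type:

use std::collections::HashMap;

// Illustrative stand-in for the scraped result type; only the fields needed
// to show the keying behaviour are included.
#[derive(Clone, Debug)]
struct Scraped {
    url: String,
    title: String,
}

fn main() {
    let scraped = vec![
        Scraped { url: "https://example.com/a".into(), title: "A".into() },
        Scraped { url: "https://example.com/a".into(), title: "A (duplicate)".into() },
        Scraped { url: "https://example.com/b".into(), title: "B".into() },
    ];

    // Collecting (url, result) pairs into a HashMap keeps one entry per URL;
    // when a key repeats, the later value wins.
    let map: HashMap<String, Scraped> = scraped
        .into_iter()
        .map(|result| (result.url.clone(), result))
        .collect();

    assert_eq!(map.len(), 2);
    println!("{:?}", map.get("https://example.com/a").map(|hit| &hit.title));
}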

View file

@@ -1,7 +1,7 @@
//! This module provides the error enum to handle different errors associated while requesting data from
//! the upstream search engines with the search query provided by the user.
use crate::results::aggregation_models::RawSearchResult;
use crate::results::aggregation_models::SearchResult;
use error_stack::{IntoReport, Result, ResultExt};
use std::{collections::HashMap, fmt, time::Duration};
@@ -43,9 +43,9 @@ impl fmt::Display for EngineError {
impl error_stack::Context for EngineError {}
/// A trait to define common behaviour for all search engines.
/// A trait to define common behavior for all search engines.
#[async_trait::async_trait]
pub trait SearchEngine {
pub trait SearchEngine: Sync + Send {
async fn fetch_html_from_upstream(
&self,
url: String,
@@ -56,7 +56,7 @@ pub trait SearchEngine {
Ok(reqwest::Client::new()
.get(url)
.timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
.headers(header_map) // add spoofed headers to emulate human behaviour
.headers(header_map) // add spoofed headers to emulate human behavior
.send()
.await
.into_report()
@@ -73,5 +73,37 @@ pub trait SearchEngine {
page: u32,
user_agent: String,
request_timeout: u8,
) -> Result<HashMap<String, RawSearchResult>, EngineError>;
) -> Result<HashMap<String, SearchResult>, EngineError>;
}
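The new `Sync + Send` bound on the trait appears to exist so that boxed engines can be moved into `tokio::spawn`, whose futures must be `Send`. A rough sketch of the constraint, where `Fetch` and `Dummy` are illustrative stand-ins rather than types from this crate:

// Without the Send supertrait, Box<dyn Fetch> would not be Send and the
// spawned future below would be rejected by the compiler.
#[async_trait::async_trait]
trait Fetch: Sync + Send {
    async fn fetch(&self) -> String;
}

struct Dummy;

#[async_trait::async_trait]
impl Fetch for Dummy {
    async fn fetch(&self) -> String {
        "ok".to_string()
    }
}

#[tokio::main]
async fn main() {
    let engine: Box<dyn Fetch> = Box::new(Dummy);
    // The trait object is moved into the task, so everything it captures
    // must be Send for tokio::spawn to accept the future.
    let handle = tokio::spawn(async move { engine.fetch().await });
    println!("{}", handle.await.unwrap());
}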
pub struct EngineHandler {
engine: Box<dyn SearchEngine>,
name: &'static str,
}
impl Clone for EngineHandler {
fn clone(&self) -> Self {
Self::new(self.name).unwrap()
}
}
impl EngineHandler {
/// Parses an engine name into an engine handler; returns `None` if the engine is unknown.
pub fn new(engine_name: &str) -> Option<Self> {
let engine: (&'static str, Box<dyn SearchEngine>) =
match engine_name.to_lowercase().as_str() {
"duckduckgo" => ("duckduckgo", Box::new(super::duckduckgo::DuckDuckGo)),
"searx" => ("searx", Box::new(super::searx::Searx)),
_ => return None,
};
Some(Self {
engine: engine.1,
name: engine.0,
})
}
pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
(self.name, self.engine)
}
}
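A hedged test sketch (not part of the commit) of how `EngineHandler` is meant to be used: unknown names yield `None`, matching is case-insensitive, and `into_name_engine` splits the handler back into its name and boxed engine.

#[cfg(test)]
mod engine_handler_sketch {
    use super::*;

    #[test]
    fn parses_known_engines_and_rejects_unknown_ones() {
        // Unknown names yield None instead of panicking, which is what lets
        // the config and cookie parsing silently skip bad entries.
        assert!(EngineHandler::new("not-a-real-engine").is_none());

        // Matching is case-insensitive; the stored name is the lowercase form.
        let handler = EngineHandler::new("DuckDuckGo").expect("known engine");

        // Clone re-runs the name lookup because Box<dyn SearchEngine> itself
        // cannot be cloned.
        let copy = handler.clone();
        let (name, _engine) = copy.into_name_engine();
        assert_eq!(name, "duckduckgo");
    }
}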

View file

@@ -6,7 +6,7 @@ use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use scraper::{Html, Selector};
use std::collections::HashMap;
use crate::results::aggregation_models::RawSearchResult;
use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine};
use error_stack::{IntoReport, Report, Result, ResultExt};
@@ -42,7 +42,7 @@ impl SearchEngine for Searx {
page: u32,
user_agent: String,
request_timeout: u8,
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that upstream server receives valid page number.
let url: String = match page {
@@ -111,7 +111,7 @@ impl SearchEngine for Searx {
Ok(document
.select(&results)
.map(|result| {
RawSearchResult::new(
SearchResult::new(
result
.select(&result_title)
.next()
@@ -137,7 +137,7 @@ impl SearchEngine for Searx {
vec!["searx".to_string()],
)
})
.map(|search_result| (search_result.visiting_url.clone(), search_result))
.map(|search_result| (search_result.url.clone(), search_result))
.collect())
}
}

View file

@@ -5,55 +5,6 @@ use serde::{Deserialize, Serialize};
use crate::{config::parser_models::Style, engines::engine_models::EngineError};
/// A named struct to store, serialize and deserializes the individual search result from all the
/// scraped and aggregated search results from the upstream search engines.
///
/// # Fields
///
/// * `title` - The title of the search result.
/// * `visiting_url` - The url which is accessed when clicked on it (href url in html in simple
/// words).
/// * `url` - The url to be displayed below the search result title in html.
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which this results were provided.
#[derive(Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub title: String,
pub visiting_url: String,
pub url: String,
pub description: String,
pub engine: Vec<String>,
}
impl SearchResult {
/// Constructs a new `SearchResult` with the given arguments needed for the struct.
///
/// # Arguments
///
/// * `title` - The title of the search result.
/// * `visiting_url` - The url which is accessed when clicked on it
/// (href url in html in simple words).
/// * `url` - The url to be displayed below the search result title in html.
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which this results were provided.
pub fn new(
title: String,
visiting_url: String,
url: String,
description: String,
engine: Vec<String>,
) -> Self {
SearchResult {
title,
visiting_url,
url,
description,
engine,
}
}
}
/// A named struct to store the raw search results scraped from the
/// upstream search engines before aggregating them. It derives the Clone trait which is needed
/// to write idiomatic Rust using `Iterators`.
@@ -61,37 +12,33 @@ impl SearchResult {
/// # Fields
///
/// * `title` - The title of the search result.
/// * `visiting_url` - The url which is accessed when clicked on it
/// * `url` - The url which is accessed when clicked on it
/// (href url in html in simple words).
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided.
#[derive(Clone)]
pub struct RawSearchResult {
#[derive(Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub title: String,
pub visiting_url: String,
pub url: String,
pub description: String,
pub engine: Vec<String>,
}
impl RawSearchResult {
impl SearchResult {
/// Constructs a new `SearchResult` with the given arguments needed for the struct.
///
/// # Arguments
///
/// * `title` - The title of the search result.
/// * `visiting_url` - The url which is accessed when clicked on it
/// * `url` - The url which is accessed when clicked on it
/// (href url in html in simple words).
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided.
pub fn new(
title: String,
visiting_url: String,
description: String,
engine: Vec<String>,
) -> Self {
RawSearchResult {
pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
SearchResult {
title,
visiting_url,
url,
description,
engine,
}
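With `visiting_url` gone, the merged type is both the scraping-side and the template-side result, serialized with camelCase renaming. An illustrative copy of the shape (field names taken from the struct above; `SearchResultShape` is an assumed stand-in, not the crate's type) serialized with `serde_json`:

use serde::{Deserialize, Serialize};

// Illustrative copy of the consolidated result shape; the real type also
// carries helpers such as add_engines used later by the aggregator.
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
struct SearchResultShape {
    title: String,
    url: String,
    description: String,
    engine: Vec<String>,
}

fn main() {
    let result = SearchResultShape {
        title: "Example".into(),
        url: "https://example.com".into(),
        description: "An example result".into(),
        engine: vec!["duckduckgo".into()],
    };
    // The single `url` field now serves both as the link shown to the user
    // and as the key the aggregator deduplicates on.
    println!("{}", serde_json::to_string_pretty(&result).unwrap());
}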

View file

@@ -8,18 +8,14 @@ use rand::Rng;
use tokio::task::JoinHandle;
use super::{
aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
user_agent::random_user_agent,
};
use crate::engines::{
duckduckgo,
engine_models::{EngineError, SearchEngine},
searx,
};
use crate::engines::engine_models::{EngineError, EngineHandler};
/// Aliases for long type annotations
type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
/// The function aggregates the scraped results from the user-selected upstream search engines.
/// These engines can be chosen either from the user interface (UI) or from the configuration file.
@@ -64,139 +60,93 @@ pub async fn aggregate(
page: u32,
random_delay: bool,
debug: bool,
upstream_search_engines: Vec<String>,
upstream_search_engines: Vec<EngineHandler>,
request_timeout: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
let user_agent: String = random_user_agent();
let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
// Add a random delay before making the request.
if random_delay || !debug {
let mut rng = rand::thread_rng();
let delay_secs = rng.gen_range(1..10);
std::thread::sleep(Duration::from_secs(delay_secs));
tokio::time::sleep(Duration::from_secs(delay_secs)).await;
}
// fetch results from upstream search engines simultaneously/concurrently.
let search_engines: Vec<Box<dyn SearchEngine + Send + Sync>> = upstream_search_engines
.iter()
.map(|engine| match engine.to_lowercase().as_str() {
"duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
"searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
&_ => panic!("Config Error: Incorrect config file option provided"),
})
.collect();
let mut names: Vec<&str> = vec![];
let task_capacity: usize = search_engines.len();
// create tasks for upstream result fetching
let mut tasks: FutureVec = FutureVec::new();
let tasks: FutureVec = search_engines
.into_iter()
.map(|search_engine| {
let query: String = query.clone();
let user_agent: String = user_agent.clone();
tokio::spawn(async move {
search_engine
.results(query, page, user_agent.clone(), request_timeout)
.await
})
})
.collect();
for engine_handler in upstream_search_engines {
let (name, search_engine) = engine_handler.into_name_engine();
names.push(name);
let query: String = query.clone();
let user_agent: String = user_agent.clone();
tasks.push(tokio::spawn(async move {
search_engine
.results(query, page, user_agent.clone(), request_timeout)
.await
}));
}
let mut outputs = Vec::with_capacity(task_capacity);
// get upstream responses
let mut responses = Vec::with_capacity(tasks.len());
for task in tasks {
if let Ok(result) = task.await {
outputs.push(result)
responses.push(result)
}
}
// aggregate search results, removing duplicates and handling errors the upstream engines returned
let mut result_map: HashMap<String, SearchResult> = HashMap::new();
let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
// The code block `outputs.iter()` determines whether it is the first time the code is being run.
// It does this by checking the initial flag. If it is the first time, the code selects the first
// engine from which results are fetched and adds or extends them into the `result_map`. If the
// initially selected engine fails, the code automatically selects another engine to map or extend
// into the `result_map`. On the other hand, if an engine selected for the first time successfully
// fetches results and maps them into the `result_map`, the initial flag is set to false. Subsequently,
// the code iterates through the remaining engines one by one. It compares the fetched results from each
// engine with the results already present in the `result_map` to identify any duplicates. If duplicate
// results are found, the code groups them together with the name of the engine from which they were
// fetched, and automatically removes the duplicate results from the newly fetched data.
//
// Additionally, the code handles errors returned by the engines. It keeps track of which engines
// encountered errors and stores this information in a vector of structures called `EngineErrorInfo`.
// Each structure in this vector contains the name of the engine and the type of error it returned.
// These structures will later be added to the final `SearchResults` structure. The `SearchResults`
// structure is used to display an error box in the UI containing the relevant information from
// the `EngineErrorInfo` structure.
//
// In summary, this code block manages the selection of engines, handling of duplicate results, and tracking
// of errors in order to populate the `result_map` and provide informative feedback to the user through the
// `SearchResults` structure.
let mut initial: bool = true;
let mut counter: usize = 0;
outputs.iter().for_each(|results| {
if initial {
match results {
Ok(result) => {
result_map.extend(result.clone());
counter += 1;
initial = false
let mut handle_error = |error: Report<EngineError>, engine_name: String| {
log::error!("Engine Error: {:?}", error);
engine_errors_info.push(EngineErrorInfo::new(
error.downcast_ref::<EngineError>().unwrap(),
engine_name.to_string(),
));
};
for _ in 0..responses.len() {
let response = responses.pop().unwrap();
let engine = names.pop().unwrap().to_string();
if result_map.is_empty() {
match response {
Ok(results) => {
result_map = results.clone();
}
Err(error_type) => {
log::error!("Engine Error: {:?}", error_type);
engine_errors_info.push(EngineErrorInfo::new(
error_type.downcast_ref::<EngineError>().unwrap(),
upstream_search_engines[counter].clone(),
));
counter += 1
Err(error) => {
handle_error(error, engine);
}
}
} else {
match results {
Ok(result) => {
result.clone().into_iter().for_each(|(key, value)| {
result_map
.entry(key)
.and_modify(|result| {
result.add_engines(value.clone().engine());
})
.or_insert_with(|| -> RawSearchResult {
RawSearchResult::new(
value.title.clone(),
value.visiting_url.clone(),
value.description.clone(),
value.engine.clone(),
)
});
});
counter += 1
}
Err(error_type) => {
log::error!("Engine Error: {:?}", error_type);
engine_errors_info.push(EngineErrorInfo::new(
error_type.downcast_ref::<EngineError>().unwrap(),
upstream_search_engines[counter].clone(),
));
counter += 1
}
continue;
}
match response {
Ok(result) => {
result.into_iter().for_each(|(key, value)| {
result_map
.entry(key)
.and_modify(|result| {
result.add_engines(engine.clone());
})
.or_insert_with(|| -> SearchResult { value });
});
}
Err(error) => {
handle_error(error, engine);
}
}
});
}
let results = result_map.into_values().collect();
Ok(SearchResults::new(
result_map
.into_iter()
.map(|(key, value)| {
SearchResult::new(
value.title,
value.visiting_url,
key,
value.description,
value.engine,
)
})
.collect(),
results,
query.to_string(),
engine_errors_info,
))
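The `entry` / `and_modify` / `or_insert_with` chain above merges each engine's batch into the shared map: a URL seen before only gains the new engine's name, while a fresh URL is inserted as-is. A self-contained sketch of that merge step, with `Hit` and `merge` as illustrative stand-ins rather than the crate's types:

use std::collections::HashMap;

// Illustrative stand-in for the aggregated result; only the fields the merge
// step touches are kept.
#[derive(Clone, Debug)]
struct Hit {
    title: String,
    engine: Vec<String>,
}

fn merge(map: &mut HashMap<String, Hit>, engine: &str, batch: Vec<(String, Hit)>) {
    for (url, value) in batch {
        map.entry(url)
            // Duplicate URL: keep the existing hit, just record that this
            // engine returned it too.
            .and_modify(|hit| hit.engine.push(engine.to_string()))
            // New URL: take the freshly scraped hit as-is.
            .or_insert_with(|| value);
    }
}

fn main() {
    let mut map = HashMap::new();
    let first = Hit { title: "A".into(), engine: vec!["duckduckgo".into()] };
    merge(&mut map, "duckduckgo", vec![("https://a".into(), first)]);

    let second = Hit { title: "A".into(), engine: vec!["searx".into()] };
    merge(&mut map, "searx", vec![("https://a".into(), second)]);

    assert_eq!(map.len(), 1);
    assert_eq!(map["https://a"].engine, vec!["duckduckgo", "searx"]);
}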

View file

@@ -7,12 +7,14 @@ use std::fs::read_to_string;
use crate::{
cache::cacher::RedisCache,
config::parser::Config,
engines::engine_models::EngineHandler,
handler::public_paths::public_path,
results::{aggregation_models::SearchResults, aggregator::aggregate},
};
use actix_web::{get, web, HttpRequest, HttpResponse};
use handlebars::Handlebars;
use serde::Deserialize;
use tokio::join;
/// A named struct which deserializes all the user provided search parameters and stores them.
///
@@ -96,15 +98,49 @@ pub async fn search(
}
let page = match &params.page {
Some(page) => *page,
None => 0,
None => 1,
};
let url = format!(
"http://{}:{}/search?q={}&page={}",
config.binding_ip, config.port, query, page
let (_, results, _) = join!(
results(
format!(
"http://{}:{}/search?q={}&page={}",
config.binding_ip,
config.port,
query,
page - 1
),
&config,
query.to_string(),
page - 1,
req.clone(),
),
results(
format!(
"http://{}:{}/search?q={}&page={}",
config.binding_ip, config.port, query, page
),
&config,
query.to_string(),
page,
req.clone(),
),
results(
format!(
"http://{}:{}/search?q={}&page={}",
config.binding_ip,
config.port,
query,
page + 1
),
&config,
query.to_string(),
page + 1,
req.clone(),
)
);
let results_json = results(url, &config, query.to_string(), page, req).await?;
let page_content: String = hbs.render("search", &results_json)?;
let page_content: String = hbs.render("search", &results?)?;
Ok(HttpResponse::Ok().body(page_content))
}
None => Ok(HttpResponse::Found()
@@ -140,12 +176,19 @@ async fn results(
{
Some(cookie_value) => {
let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
let engines = cookie_value
.engines
.iter()
.filter_map(|name| EngineHandler::new(name))
.collect();
aggregate(
query,
page,
config.aggregator.random_delay,
config.debug,
cookie_value.engines,
engines,
config.request_timeout,
)
.await?
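The `join!` block earlier in this file fetches the previous, current, and next page concurrently and renders only the middle result; the neighbouring calls are presumably there to warm the cache behind `results`. A rough sketch of that pattern with an assumed `fetch_page` helper (not the crate's API):

// Stand-in for the per-page results() call; in the real route this would
// consult the cache and fall back to aggregate().
async fn fetch_page(page: u32) -> Result<String, std::io::Error> {
    Ok(format!("results for page {page}"))
}

#[tokio::main]
async fn main() -> Result<(), std::io::Error> {
    let page: u32 = 1;
    // The three fetches run concurrently; saturating_sub keeps the sketch
    // safe when page is already 0.
    let (_prev, current, _next) = tokio::join!(
        fetch_page(page.saturating_sub(1)),
        fetch_page(page),
        fetch_page(page + 1),
    );
    // Only the requested page's outcome decides the response; errors in the
    // neighbouring prefetches are deliberately ignored.
    println!("{}", current?);
    Ok(())
}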