Merge branch 'rolling' into improve-async-multithreading

2023-07-15 19:50:31 +03:00 · 2023-07-15 19:50:31 +03:00 · 74e4fc6169
commit 74e4fc6169
parent 1c2ea24024 e4625c3fd0
51 changed files with 548 additions and 339 deletions
--- a/src/results/aggregation_models.rs
+++ b/src/results/aggregation_models.rs
@ -0,0 +1,200 @@
+//! This module provides public models for handling, storing and serializing the search result
+//! data scraped from the upstream search engines.
+
+use serde::{Deserialize, Serialize};
+
+use crate::{config::parser_models::Style, engines::engine_models::EngineError};
+
+/// A named struct to store, serialize and deserialize an individual search result taken from
+/// the scraped and aggregated search results of the upstream search engines.
+///
+/// # Fields
+///
+/// * `title` - The title of the search result.
+/// * `visiting_url` - The url which is accessed when the result is clicked (the `href` url in
+/// the html, in simple words).
+/// * `url` - The url to be displayed below the search result title in html.
+/// * `description` - The description of the search result.
+/// * `engine` - The names of the upstream engines from which this result was provided.
+#[derive(Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")] // fields are (de)serialized as camelCase, e.g. `visitingUrl`
+pub struct SearchResult {
+    pub title: String,
+    pub visiting_url: String,
+    pub url: String,
+    pub description: String,
+    pub engine: Vec<String>,
+}
+
+impl SearchResult {
+    /// Builds a `SearchResult` out of its individual parts; every argument is stored unchanged
+    /// in the field of the same name.
+    ///
+    /// # Arguments
+    ///
+    /// * `title` - The title of the search result.
+    /// * `visiting_url` - The url which is accessed when the result is clicked
+    /// (the `href` url in the html, in simple words).
+    /// * `url` - The url to be displayed below the search result title in html.
+    /// * `description` - The description of the search result.
+    /// * `engine` - The names of the upstream engines from which this result was provided.
+    pub fn new(
+        title: String,
+        visiting_url: String,
+        url: String,
+        description: String,
+        engine: Vec<String>,
+    ) -> Self {
+        Self {
+            title,
+            visiting_url,
+            url,
+            description,
+            engine,
+        }
+    }
+}
+
+/// A named struct to store the raw search results scraped from the upstream search engines
+/// before they are aggregated. It derives the `Clone` trait so that results can be duplicated
+/// where needed while working with iterators.
+///
+/// # Fields
+///
+/// * `title` - The title of the search result.
+/// * `visiting_url` - The url which is accessed when the result is clicked
+/// (the `href` url in the html, in simple words).
+/// * `description` - The description of the search result.
+/// * `engine` - The names of the upstream engines from which this result was provided.
+#[derive(Clone)]
+pub struct RawSearchResult {
+    pub title: String,
+    pub visiting_url: String,
+    pub description: String,
+    pub engine: Vec<String>,
+}
+
+impl RawSearchResult {
+    /// Constructs a new `RawSearchResult` with the given arguments needed for the struct.
+    ///
+    /// # Arguments
+    ///
+    /// * `title` - The title of the search result.
+    /// * `visiting_url` - The url which is accessed when the result is clicked
+    /// (the `href` url in the html, in simple words).
+    /// * `description` - The description of the search result.
+    /// * `engine` - The names of the upstream engines from which this result was provided.
+    pub fn new(
+        title: String,
+        visiting_url: String,
+        description: String,
+        engine: Vec<String>,
+    ) -> Self {
+        Self {
+            title,
+            visiting_url,
+            description,
+            engine,
+        }
+    }
+
+    /// Appends the given engine name to the list of engines that provided this result.
+    ///
+    /// # Arguments
+    ///
+    /// * `engine` - Takes an engine name provided as a String.
+    pub fn add_engines(&mut self, engine: String) {
+        self.engine.push(engine)
+    }
+
+    /// Consumes the result and returns the first engine name stored in it.
+    ///
+    /// # Returns
+    ///
+    /// The first engine name recorded for this result, or an empty string when no engine name
+    /// has been recorded at all (instead of panicking).
+    pub fn engine(self) -> String {
+        // `self` is owned, so the first name can be moved out without cloning it.
+        self.engine.into_iter().next().unwrap_or_default()
+    }
+}
+
+/// A named struct to store and (de)serialize the information about an upstream engine that
+/// failed: the name of the error type and the name of the engine it belongs to.
+///
+/// # Fields
+///
+/// * `error` - The name of the `EngineError` variant that occurred, stored as a string.
+/// * `engine` - The name of the engine the error is reported for.
+#[derive(Serialize, Deserialize)]
+pub struct EngineErrorInfo {
+    pub error: String,
+    pub engine: String,
+}
+
+impl EngineErrorInfo {
+    /// Constructs a new `EngineErrorInfo` from an engine error and the engine's name.
+    ///
+    /// # Arguments
+    ///
+    /// * `error` - The error whose variant name is stored as a string.
+    /// * `engine` - The name of the engine the error is reported for.
+    pub fn new(error: &EngineError, engine: String) -> Self {
+        // Pick the variant name first and convert it into an owned string only once.
+        let error_name: &str = match error {
+            EngineError::RequestError => "RequestError",
+            EngineError::EmptyResultSet => "EmptyResultSet",
+            EngineError::UnexpectedError => "UnexpectedError",
+        };
+
+        Self {
+            error: error_name.to_owned(),
+            engine,
+        }
+    }
+}
+
+/// A named struct to store, serialize and deserialize all the search results scraped and
+/// aggregated from the upstream search engines.
+///
+/// # Fields
+///
+/// * `results` - Stores the individual serializable `SearchResult` structs in a vector of
+/// `SearchResult` structs.
+/// * `page_query` - Stores the current page's search query `q` provided in the search url.
+/// * `style` - Stores the theming options for the website.
+/// * `engine_errors_info` - Stores the information on which engines failed: their engine name
+/// and the type of error that caused the failure.
+/// * `empty_result_set` - Stores a boolean which indicates that no engine gave a result for the
+/// given search query.
+#[derive(Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct SearchResults {
+    pub results: Vec<SearchResult>,
+    pub page_query: String,
+    pub style: Style,
+    pub engine_errors_info: Vec<EngineErrorInfo>,
+    pub empty_result_set: bool,
+}
+
+impl SearchResults {
+    /// Constructs a new `SearchResults` from the aggregated results. The `style` field starts
+    /// out as a `Style` built from two empty strings and `empty_result_set` starts as `false`;
+    /// use `add_style` and `set_empty_result_set` to change them afterwards.
+    ///
+    /// # Arguments
+    ///
+    /// * `results` - Takes the vector of individual serializable `SearchResult` structs
+    /// to store.
+    /// * `page_query` - Takes the current page's search query `q` provided in
+    /// the search url.
+    /// * `engine_errors_info` - Takes the information on which engines failed, with their engine
+    /// name and the type of error that caused it.
+    pub fn new(
+        results: Vec<SearchResult>,
+        page_query: String,
+        engine_errors_info: Vec<EngineErrorInfo>,
+    ) -> Self {
+        // Placeholder style; it is expected to be replaced through `add_style`.
+        let style = Style::new(String::new(), String::new());
+
+        Self {
+            results,
+            page_query,
+            style,
+            engine_errors_info,
+            empty_result_set: false,
+        }
+    }
+
+    /// A setter function which replaces the website style stored in the search results.
+    pub fn add_style(&mut self, style: Style) {
+        self.style = style;
+    }
+
+    /// A function which reports whether the stored `results` vector is empty. It does not read
+    /// the `empty_result_set` flag.
+    pub fn is_empty_result_set(&self) -> bool {
+        self.results.is_empty()
+    }
+
+    /// A setter function which sets the `empty_result_set` flag to true.
+    pub fn set_empty_result_set(&mut self) {
+        self.empty_result_set = true;
+    }
+}
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@ -0,0 +1,171 @@
+//! This module provides the functionality to scrape and gather all the results from the upstream
+//! search engines and then remove duplicate results.
+
+use std::{collections::HashMap, time::Duration};
+
+use error_stack::Report;
+use rand::Rng;
+use tokio::task::JoinHandle;
+
+use super::{
+    aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
+    user_agent::random_user_agent,
+};
+
+use crate::engines::{
+    duckduckgo,
+    engine_models::{EngineError, SearchEngine},
+    searx,
+};
+
+/// Alias for the vector of spawned engine tasks; each task yields a result map or an engine error.
+type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
+
+/// Aggregates the search results scraped from the selected upstream search engines.
+///
+/// Every engine named in `upstream_search_engines` (selected either from the UI or from the
+/// config file) is queried in its own task spawned with `tokio::spawn`, so the engines are
+/// fetched concurrently. The outcome of each task is then merged:
+///
+/// * Successful results are gathered in a `HashMap`. When the same key is returned by more than
+///   one engine the result is kept only once and the name of the additional engine is appended
+///   to it, which removes duplicates while still showing every engine that found the result.
+/// * Failures are recorded as `EngineErrorInfo` entries holding the engine name and the error
+///   type, so that they can be shown in the UI. A task that could not be joined is recorded
+///   as an `UnexpectedError` of its engine.
+///
+/// Finally all data is moved out of the `HashMap` into a `SearchResults` struct together with
+/// the query that was used. Storing the query is necessary because otherwise the search bar
+/// remains empty when the search was started from the query url.
+///
+/// # Example
+///
+/// If you search from the url like `https://127.0.0.1/search?q=huston` then the search bar should
+/// contain the word huston and not remain empty.
+///
+/// # Arguments
+///
+/// * `query` - Accepts a string to query with the above upstream search engines.
+/// * `page` - Accepts an u32 page number.
+/// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
+/// The delay (1 to 9 seconds) is also applied whenever `debug` is `false`.
+/// * `debug` - Accepts a boolean value to enable or disable debug mode option.
+/// * `upstream_search_engines` - Accepts a vector of search engine names which was selected by the
+/// user through the UI or the config file.
+///
+/// # Errors
+///
+/// Returns an error when `upstream_search_engines` contains a name other than `duckduckgo` or
+/// `searx` (compared case-insensitively). Failures of the individual engines are not returned
+/// as an error; they are reported through the `engine_errors_info` field of the returned
+/// `SearchResults` struct.
+pub async fn aggregate(
+    query: String,
+    page: u32,
+    random_delay: bool,
+    debug: bool,
+    upstream_search_engines: Vec<String>,
+) -> Result<SearchResults, Box<dyn std::error::Error>> {
+    let user_agent: String = random_user_agent();
+
+    // Add a random delay before making the request.
+    if random_delay || !debug {
+        // The thread-local rng is only a temporary of this statement, so it is gone before the
+        // `.await` below.
+        let delay_secs = rand::thread_rng().gen_range(1..10);
+        // Sleep asynchronously: `std::thread::sleep` would block the executor thread and stall
+        // every other task scheduled on it.
+        tokio::time::sleep(Duration::from_secs(delay_secs)).await;
+    }
+
+    // Resolve the engine names. The names may come from the UI, so an unknown name is reported
+    // as an error to the caller instead of panicking.
+    let search_engines = upstream_search_engines
+        .iter()
+        .map(|engine| match engine.to_lowercase().as_str() {
+            "duckduckgo" => {
+                Ok(Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>)
+            }
+            "searx" => Ok(Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>),
+            _ => Err(Box::<dyn std::error::Error>::from(format!(
+                "Config Error: Incorrect config file option provided: unknown search engine `{}`",
+                engine
+            ))),
+        })
+        .collect::<Result<Vec<_>, Box<dyn std::error::Error>>>()?;
+
+    // fetch results from upstream search engines simultaneously/concurrently.
+    let tasks: FutureVec = search_engines
+        .into_iter()
+        .map(|search_engine| {
+            let query: String = query.clone();
+            let user_agent: String = user_agent.clone();
+            tokio::spawn(async move { search_engine.results(query, page, user_agent).await })
+        })
+        .collect();
+
+    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
+    let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
+
+    // `tasks` was built in the order of `upstream_search_engines`, so zipping both keeps every
+    // outcome paired with the engine that produced it, even when a task fails to join.
+    for (engine_name, task) in upstream_search_engines.iter().zip(tasks) {
+        match task.await {
+            Ok(Ok(results)) => {
+                for (key, value) in results {
+                    match result_map.entry(key) {
+                        // Already reported by an earlier engine: only remember this engine too.
+                        std::collections::hash_map::Entry::Occupied(mut existing) => {
+                            existing.get_mut().add_engines(value.engine());
+                        }
+                        // First time this result is seen: store it as it is.
+                        std::collections::hash_map::Entry::Vacant(slot) => {
+                            slot.insert(value);
+                        }
+                    }
+                }
+            }
+            Ok(Err(report)) => engine_errors_info.push(EngineErrorInfo::new(
+                report
+                    .downcast_ref::<EngineError>()
+                    .unwrap_or(&EngineError::UnexpectedError),
+                engine_name.clone(),
+            )),
+            // The task could not be joined, so no result and no engine error is available.
+            Err(_join_error) => engine_errors_info.push(EngineErrorInfo::new(
+                &EngineError::UnexpectedError,
+                engine_name.clone(),
+            )),
+        }
+    }
+
+    Ok(SearchResults::new(
+        result_map
+            .into_iter()
+            .map(|(key, value)| {
+                // The map key is used as the url displayed below the title.
+                SearchResult::new(
+                    value.title,
+                    value.visiting_url,
+                    key,
+                    value.description,
+                    value.engine,
+                )
+            })
+            .collect(),
+        query,
+        engine_errors_info,
+    ))
+}
--- a/src/results/mod.rs
+++ b/src/results/mod.rs
@ -0,0 +1,3 @@
+pub mod aggregation_models;
+pub mod aggregator;
+pub mod user_agent;
--- a/src/results/user_agent.rs
+++ b/src/results/user_agent.rs
@ -0,0 +1,28 @@
+//! This module provides the functionality to generate a random user agent string.
+
+use fake_useragent::{Browsers, UserAgents, UserAgentsBuilder};
+
+static USER_AGENTS: once_cell::sync::Lazy<UserAgents> = once_cell::sync::Lazy::new(|| { // built once, on first access, then shared
+    UserAgentsBuilder::new()
+        .cache(false) // NOTE(review): presumably disables caching of the agent list — confirm in fake_useragent docs
+        .dir("/tmp") // NOTE(review): hard-coded Unix path; presumably the crate's working directory — confirm
+        .thread(1) // NOTE(review): presumably the number of threads used while building — confirm
+        .set_browsers( // browsers enabled for the builder
+            Browsers::new()
+                .set_chrome()
+                .set_safari()
+                .set_edge()
+                .set_firefox()
+                .set_mozilla(),
+        )
+        .build()
+});
+
+/// Picks a random user agent from the shared `USER_AGENTS` collection, which is used to improve
+/// the privacy of the user.
+///
+/// # Returns
+///
+/// The randomly picked user agent as an owned string.
+pub fn random_user_agent() -> String {
+    let agent = USER_AGENTS.random();
+    agent.to_string()
+}