⚡ perf: several optimizations for improving the performance of the engine (#540)

* ♻️ refactor: initialize & store the config & cache structs as a constant (#486) - initializes & stores the config & cache structs as a static constant. - Pass the config & cache structs as a static reference to all the functions handling their respective route. * ⚡ perf: replace hashmaps with vectors for fetching & aggregating results (#486) - replace hashmaps with vectors for fetching, collecting & aggregating results as it tends to be contigous & cache efficient data structure. - refactor & redesign algorithms for fetching & aggregating results centered around vectors in aggregate function. * ➕ build: add the future crate (#486) * ⚡ perf: use `futureunordered` for collecting results fetched from the tokio spawn tasks (#486) - using the `futureunordered` instead of vector for collecting results reduces the time it takes to fetch the results as the results do not need to come in specific order so any result that gets fetched first gets collected in the `futureunordered` type. Co-authored-by: Spencerjibz <spencernajib2@gmail.com> * ⚡ perf: initialize new async connections parallely using tokio spawn tasks (#486) * ⚡ perf: initialize redis pipeline struct once with the default size of 3 (#486) * ⚡ perf: reduce branch predictions by reducing conditional code branches (#486) * ✅ test(unit): provide unit test for the `get_safesearch_level` function (#486) * ⚡ perf: reduce clones & use index based loop to improve search results filtering performance (#486) * 🚨 fix(clippy): make clippy/format checks happy (#486) * 🚨 fix(build): make the cargo build check happy (#486) * ⚡ perf: reduce the amount of clones, to_owneds & to_strings (#486) * ⚡ perf: use async crates & methods & make functions async (#486) * 🔖 chore(release): bump the app version (#486) --------- Co-authored-by: Spencerjibz <spencernajib2@gmail.com>
2024-03-11 12:01:30 +03:00 · 2024-03-11 12:01:30 +03:00 · 991f3f59de
commit 991f3f59de
parent 8d9b660eb1
25 changed files with 685 additions and 533 deletions
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@ -9,22 +9,25 @@ use crate::models::{
    engine_models::{EngineError, EngineHandler},
 };
 use error_stack::Report;
+use futures::stream::FuturesUnordered;
 use regex::Regex;
 use reqwest::{Client, ClientBuilder};
+use std::sync::Arc;
 use std::time::{SystemTime, UNIX_EPOCH};
-use std::{
-    collections::HashMap,
-    io::{BufReader, Read},
+use tokio::{
+    fs::File,
+    io::{AsyncBufReadExt, BufReader},
+    task::JoinHandle,
    time::Duration,
 };
-use std::{fs::File, io::BufRead};
-use tokio::task::JoinHandle;

 /// A constant for holding the prebuilt Client globally in the app.
 static CLIENT: std::sync::OnceLock<Client> = std::sync::OnceLock::new();

 /// Aliases for long type annotations
-type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
+
+type FutureVec =
+    FuturesUnordered<JoinHandle<Result<Vec<(String, SearchResult)>, Report<EngineError>>>>;

 /// The function aggregates the scraped results from the user-selected upstream search engines.
 /// These engines can be chosen either from the user interface (UI) or from the configuration file.
@ -37,7 +40,7 @@ type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<Eng
 ///
 /// Additionally, the function eliminates duplicate results. If two results are identified as coming from
 /// multiple engines, their names are combined to indicate that the results were fetched from these upstream
-/// engines. After this, all the data in the `HashMap` is removed and placed into a struct that contains all
+/// engines. After this, all the data in the `Vec` is removed and placed into a struct that contains all
 /// the aggregated results in a vector. Furthermore, the query used is also added to the struct. This step is
 /// necessary to ensure that the search bar in the search remains populated even when searched from the query URL.
 ///
@ -94,15 +97,22 @@ pub async fn aggregate(
    let mut names: Vec<&str> = Vec::with_capacity(0);

    // create tasks for upstream result fetching
-    let mut tasks: FutureVec = FutureVec::new();
+    let tasks: FutureVec = FutureVec::new();

+    let query: Arc<String> = Arc::new(query.to_string());
    for engine_handler in upstream_search_engines {
-        let (name, search_engine) = engine_handler.to_owned().into_name_engine();
+        let (name, search_engine) = engine_handler.clone().into_name_engine();
        names.push(name);
-        let query: String = query.to_owned();
+        let query_partially_cloned = query.clone();
        tasks.push(tokio::spawn(async move {
            search_engine
-                .results(&query, page, user_agent, client, safe_search)
+                .results(
+                    &query_partially_cloned,
+                    page,
+                    user_agent,
+                    client,
+                    safe_search,
+                )
                .await
        }));
    }
@ -117,7 +127,7 @@ pub async fn aggregate(
    }

    // aggregate search results, removing duplicates and handling errors the upstream engines returned
-    let mut result_map: HashMap<String, SearchResult> = HashMap::new();
+    let mut result_map: Vec<(String, SearchResult)> = Vec::new();
    let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();

    let mut handle_error = |error: &Report<EngineError>, engine_name: &'static str| {
@ -134,51 +144,45 @@ pub async fn aggregate(

        if result_map.is_empty() {
            match response {
-                Ok(results) => {
-                    result_map = results.clone();
-                }
-                Err(error) => {
-                    handle_error(&error, engine);
-                }
-            }
+                Ok(results) => result_map = results,
+                Err(error) => handle_error(&error, engine),
+            };
            continue;
        }

        match response {
            Ok(result) => {
                result.into_iter().for_each(|(key, value)| {
-                    result_map
-                        .entry(key)
-                        .and_modify(|result| {
-                            result.add_engines(engine);
-                        })
-                        .or_insert_with(|| -> SearchResult { value });
+                    match result_map.iter().find(|(key_s, _)| key_s == &key) {
+                        Some(value) => value.1.to_owned().add_engines(engine),
+                        None => result_map.push((key, value)),
+                    };
                });
            }
-            Err(error) => {
-                handle_error(&error, engine);
-            }
-        }
+            Err(error) => handle_error(&error, engine),
+        };
    }

    if safe_search >= 3 {
-        let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
+        let mut blacklist_map: Vec<(String, SearchResult)> = Vec::new();
        filter_with_lists(
            &mut result_map,
            &mut blacklist_map,
            file_path(FileType::BlockList)?,
-        )?;
+        )
+        .await?;

        filter_with_lists(
            &mut blacklist_map,
            &mut result_map,
            file_path(FileType::AllowList)?,
-        )?;
+        )
+        .await?;

        drop(blacklist_map);
    }

-    let results: Vec<SearchResult> = result_map.into_values().collect();
+    let results: Vec<SearchResult> = result_map.iter().map(|(_, value)| value.clone()).collect();

    Ok(SearchResults::new(results, &engine_errors_info))
 }
@ -187,35 +191,41 @@ pub async fn aggregate(
 ///
 /// # Arguments
 ///
-/// * `map_to_be_filtered` - A mutable reference to a `HashMap` of search results to filter, where the filtered results will be removed from.
-/// * `resultant_map` - A mutable reference to a `HashMap` to hold the filtered results.
+/// * `map_to_be_filtered` - A mutable reference to a `Vec` of search results to filter, where the filtered results will be removed from.
+/// * `resultant_map` - A mutable reference to a `Vec` to hold the filtered results.
 /// * `file_path` - A `&str` representing the path to a file containing regex patterns to use for filtering.
 ///
 /// # Errors
 ///
 /// Returns an error if the file at `file_path` cannot be opened or read, or if a regex pattern is invalid.
-pub fn filter_with_lists(
-    map_to_be_filtered: &mut HashMap<String, SearchResult>,
-    resultant_map: &mut HashMap<String, SearchResult>,
+pub async fn filter_with_lists(
+    map_to_be_filtered: &mut Vec<(String, SearchResult)>,
+    resultant_map: &mut Vec<(String, SearchResult)>,
    file_path: &str,
 ) -> Result<(), Box<dyn std::error::Error>> {
-    let mut reader = BufReader::new(File::open(file_path)?);
+    let reader = BufReader::new(File::open(file_path).await?);
+    let mut lines = reader.lines();

-    for line in reader.by_ref().lines() {
-        let re = Regex::new(line?.trim())?;
+    while let Some(line) = lines.next_line().await? {
+        let re = Regex::new(line.trim())?;

+        let mut length = map_to_be_filtered.len();
+        let mut idx: usize = Default::default();
        // Iterate over each search result in the map and check if it matches the regex pattern
-        for (url, search_result) in map_to_be_filtered.clone().into_iter() {
-            if re.is_match(&url.to_lowercase())
-                || re.is_match(&search_result.title.to_lowercase())
-                || re.is_match(&search_result.description.to_lowercase())
+        while idx < length {
+            let ele = &map_to_be_filtered[idx];
+            let ele_inner = &ele.1;
+            match re.is_match(&ele.0.to_lowercase())
+                || re.is_match(&ele_inner.title.to_lowercase())
+                || re.is_match(&ele_inner.description.to_lowercase())
            {
-                // If the search result matches the regex pattern, move it from the original map to the resultant map
-                resultant_map.insert(
-                    url.to_owned(),
-                    map_to_be_filtered.remove(&url.to_owned()).unwrap(),
-                );
-            }
+                true => {
+                    // If the search result matches the regex pattern, move it from the original map to the resultant map
+                    resultant_map.push(map_to_be_filtered.swap_remove(idx));
+                    length -= 1;
+                }
+                false => idx += 1,
+            };
        }
    }

@ -226,15 +236,14 @@ pub fn filter_with_lists(
 mod tests {
    use super::*;
    use smallvec::smallvec;
-    use std::collections::HashMap;
    use std::io::Write;
    use tempfile::NamedTempFile;

-    #[test]
-    fn test_filter_with_lists() -> Result<(), Box<dyn std::error::Error>> {
+    #[tokio::test]
+    async fn test_filter_with_lists() -> Result<(), Box<dyn std::error::Error>> {
        // Create a map of search results to filter
-        let mut map_to_be_filtered = HashMap::new();
-        map_to_be_filtered.insert(
+        let mut map_to_be_filtered = Vec::new();
+        map_to_be_filtered.push((
            "https://www.example.com".to_owned(),
            SearchResult {
                title: "Example Domain".to_owned(),
@ -243,15 +252,15 @@ mod tests {
                    .to_owned(),
                engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
            },
-        );
-        map_to_be_filtered.insert(
+        ));
+        map_to_be_filtered.push((
            "https://www.rust-lang.org/".to_owned(),
            SearchResult {
                title: "Rust Programming Language".to_owned(),
                url: "https://www.rust-lang.org/".to_owned(),
                description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
                engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
-            },
+            },)
        );

        // Create a temporary file with regex patterns
@ -260,25 +269,30 @@ mod tests {
        writeln!(file, "rust")?;
        file.flush()?;

-        let mut resultant_map = HashMap::new();
+        let mut resultant_map = Vec::new();
        filter_with_lists(
            &mut map_to_be_filtered,
            &mut resultant_map,
            file.path().to_str().unwrap(),
-        )?;
+        )
+        .await?;

        assert_eq!(resultant_map.len(), 2);
-        assert!(resultant_map.contains_key("https://www.example.com"));
-        assert!(resultant_map.contains_key("https://www.rust-lang.org/"));
+        assert!(resultant_map
+            .iter()
+            .any(|(key, _)| key == "https://www.example.com"));
+        assert!(resultant_map
+            .iter()
+            .any(|(key, _)| key == "https://www.rust-lang.org/"));
        assert_eq!(map_to_be_filtered.len(), 0);

        Ok(())
    }

-    #[test]
-    fn test_filter_with_lists_wildcard() -> Result<(), Box<dyn std::error::Error>> {
-        let mut map_to_be_filtered = HashMap::new();
-        map_to_be_filtered.insert(
+    #[tokio::test]
+    async fn test_filter_with_lists_wildcard() -> Result<(), Box<dyn std::error::Error>> {
+        let mut map_to_be_filtered = Vec::new();
+        map_to_be_filtered.push((
            "https://www.example.com".to_owned(),
            SearchResult {
                title: "Example Domain".to_owned(),
@ -287,8 +301,8 @@ mod tests {
                    .to_owned(),
                engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
            },
-        );
-        map_to_be_filtered.insert(
+        ));
+        map_to_be_filtered.push((
            "https://www.rust-lang.org/".to_owned(),
            SearchResult {
                title: "Rust Programming Language".to_owned(),
@ -296,34 +310,39 @@ mod tests {
                description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
                engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
            },
-        );
+        ));

        // Create a temporary file with a regex pattern containing a wildcard
        let mut file = NamedTempFile::new()?;
        writeln!(file, "ex.*le")?;
        file.flush()?;

-        let mut resultant_map = HashMap::new();
+        let mut resultant_map = Vec::new();

        filter_with_lists(
            &mut map_to_be_filtered,
            &mut resultant_map,
            file.path().to_str().unwrap(),
-        )?;
+        )
+        .await?;

        assert_eq!(resultant_map.len(), 1);
-        assert!(resultant_map.contains_key("https://www.example.com"));
+        assert!(resultant_map
+            .iter()
+            .any(|(key, _)| key == "https://www.example.com"));
        assert_eq!(map_to_be_filtered.len(), 1);
-        assert!(map_to_be_filtered.contains_key("https://www.rust-lang.org/"));
+        assert!(map_to_be_filtered
+            .iter()
+            .any(|(key, _)| key == "https://www.rust-lang.org/"));

        Ok(())
    }

-    #[test]
-    fn test_filter_with_lists_file_not_found() {
-        let mut map_to_be_filtered = HashMap::new();
+    #[tokio::test]
+    async fn test_filter_with_lists_file_not_found() {
+        let mut map_to_be_filtered = Vec::new();

-        let mut resultant_map = HashMap::new();
+        let mut resultant_map = Vec::new();

        // Call the `filter_with_lists` function with a non-existent file path
        let result = filter_with_lists(
@ -332,13 +351,13 @@ mod tests {
            "non-existent-file.txt",
        );

-        assert!(result.is_err());
+        assert!(result.await.is_err());
    }

-    #[test]
-    fn test_filter_with_lists_invalid_regex() {
-        let mut map_to_be_filtered = HashMap::new();
-        map_to_be_filtered.insert(
+    #[tokio::test]
+    async fn test_filter_with_lists_invalid_regex() {
+        let mut map_to_be_filtered = Vec::new();
+        map_to_be_filtered.push((
            "https://www.example.com".to_owned(),
            SearchResult {
                title: "Example Domain".to_owned(),
@ -347,9 +366,9 @@ mod tests {
                    .to_owned(),
                engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
            },
-        );
+        ));

-        let mut resultant_map = HashMap::new();
+        let mut resultant_map = Vec::new();

        // Create a temporary file with an invalid regex pattern
        let mut file = NamedTempFile::new().unwrap();
@ -362,6 +381,6 @@ mod tests {
            file.path().to_str().unwrap(),
        );

-        assert!(result.is_err());
+        assert!(result.await.is_err());
    }
 }