diff --git a/Cargo.lock b/Cargo.lock index 241e68f..a406f74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -235,12 +235,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "adler32" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" - [[package]] name = "ahash" version = "0.8.11" @@ -298,12 +292,6 @@ dependencies = [ "tokio 1.39.2", ] -[[package]] -name = "async-once-cell" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9338790e78aa95a416786ec8389546c4b6a1dfc3dc36071ed9518a9413a542eb" - [[package]] name = "async-trait" version = "0.1.81" @@ -1493,16 +1481,6 @@ dependencies = [ "winapi-build", ] -[[package]] -name = "keyword_extraction" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0efa28e79b3a5f72586318c07c24477a169c688e5065fde647c71b3952a2d42" -dependencies = [ - "regex", - "unicode-segmentation", -] - [[package]] name = "language-tags" version = "0.3.2" @@ -1521,26 +1499,6 @@ version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" -[[package]] -name = "libflate" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ff4ae71b685bbad2f2f391fe74f6b7659a34871c08b210fdc039e43bee07d18" -dependencies = [ - "adler32", - "crc32fast", - "libflate_lz77", -] - -[[package]] -name = "libflate_lz77" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a52d3a8bfc85f250440e4424db7d857e241a3aebbbe301f3eb606ab15c39acbf" -dependencies = [ - "rle-decode-fast", -] - [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -2567,12 +2525,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "rle-decode-fast" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" - [[package]] name = "rustc-demangle" version = "0.1.24" @@ -2952,15 +2904,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" -[[package]] -name = "stop-words" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8500024d809de02ecbf998472b7bed3c4fca380df2be68917f6a473bdb28ddcc" -dependencies = [ - "serde_json", -] - [[package]] name = "string" version = "0.2.1" @@ -3117,25 +3060,6 @@ dependencies = [ "utf-8", ] -[[package]] -name = "thesaurus" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e33ea271e53da683cd3439c04ff3b734f3d6052ea33a65ec9e0fa89a4f96369" -dependencies = [ - "lazy_static", - "thesaurus-moby", -] - -[[package]] -name = "thesaurus-moby" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28f7806d5dbe7d9b627e332f88269a014a6a1d40ec411d4ea66cb702aabce4cf" -dependencies = [ - "libflate", -] - [[package]] name = "thiserror" version = "1.0.63" @@ -3526,12 +3450,6 @@ dependencies = [ "tinyvec", ] -[[package]] -name = "unicode-segmentation" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" - [[package]] name = "unicode-xid" version = "0.1.0" @@ -3741,7 +3659,6 @@ dependencies = [ "actix-files", "actix-web", "async-compression", - "async-once-cell", "async-trait", "cfg-if 1.0.0", "env_logger", @@ -3749,7 +3666,6 @@ dependencies = [ "fake-useragent", "figment", "futures 0.3.30", - "keyword_extraction", "log", "maud", "mini-moka", @@ -3758,9 +3674,7 @@ dependencies = [ "scraper", "serde", "serde_json", - "stop-words", "tempfile", - "thesaurus", "tokio 1.39.2", ] diff --git a/Cargo.toml b/Cargo.toml index aa28a54..7767269 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,7 +41,6 @@ error-stack = { version = "0.4.0", default-features = false, features = [ async-trait = { version = "0.1.80", default-features = false } regex = { version = "1.9.4", features = ["perf"], default-features = false } futures = { version = "0.3.30", default-features = false, features = ["alloc"] } -async-once-cell = { version = "0.5.3", default-features = false } mini-moka = { version = "0.10", default-features = false, features = [ "sync", ] } @@ -50,20 +49,12 @@ async-compression = { version = "0.4.11", default-features = false, features = [ "tokio", ], optional = true } cfg-if = { version = "1.0.0", default-features = false, optional = true } -keyword_extraction = { version = "1.4.3", default-features = false, features = [ - "tf_idf", -] } figment = { version = "0.10", features = ["env"] } -stop-words = { version = "0.8.0", default-features = false, features = ["iso"] } -thesaurus = { version = "0.5.2", default-features = false, optional = true, features = [ - "moby", -] } + [dev-dependencies] tempfile = { version = "3.10.1", default-features = false } [features] -use-synonyms-search = ["thesaurus/static"] compress-cache-results = ["dep:async-compression", "dep:cfg-if"] experimental-io-uring = ["actix-web/experimental-io-uring"] -use-non-static-synonyms-search = ["thesaurus"] diff --git a/src/models/aggregation_models.rs b/src/models/aggregation_models.rs index ca010f3..c3c210b 100644 --- a/src/models/aggregation_models.rs +++ b/src/models/aggregation_models.rs @@ -4,11 +4,6 @@ use super::engine_models::EngineError; use serde::{Deserialize, Serialize}; -#[cfg(any( - feature = "use-synonyms-search", - feature = "use-non-static-synonyms-search" -))] -use thesaurus::synonyms; /// A named struct to store the raw scraped search results scraped search results from the /// upstream search engines before aggregating it.It derives the Clone trait which is needed /// to write idiomatic rust using `Iterators`. @@ -47,45 +42,6 @@ impl SearchResult { engine: engine.iter().map(|name| name.to_string()).collect(), } } - /// calculates and update the relevance score of the current search. - - /// # Arguments - /// - /// * query - the query string used to obtain the results - /// - /// - - pub fn calculate_relevance(&mut self, query: &str) { - use stop_words::{get, LANGUAGE}; - // when language settings can change to any of the ones supported on this crate: https://docs.rs/crate/stop-words/0.8.0 - let documents = [ - self.title.clone(), - self.url.clone(), - self.description.clone(), - ]; - - let stop_words = get(LANGUAGE::English); - let punctuation = [ - ".".to_owned(), - ",".to_owned(), - ":".to_owned(), - ";".to_owned(), - "!".to_owned(), - "?".to_owned(), - "(".to_owned(), - ")".to_owned(), - "[".to_owned(), - "]".to_owned(), - "{".to_owned(), - "}".to_owned(), - "\"".to_owned(), - "'".to_owned(), - "<".to_owned(), - ">".to_owned(), - ]; - - self.relevance_score = calculate_tf_idf(query, &documents, &stop_words, &punctuation); - } /// A function which adds the engine name provided as a string into a vector of strings. /// @@ -228,53 +184,3 @@ impl SearchResults { self.no_engines_selected = true; } } - -/// Helper function to calculate the tf-idf for the search query. -///
The approach is as [`as`](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). -///
Find a sample article about TF-IDF [`here`](https://medium.com/analytics-vidhya/tf-idf-term-frequency-technique-easiest-explanation-for-text-classification-in-nlp-with-code-8ca3912e58c3) -/// ### Arguments -/// * `query` - a user's search query -/// * `documents` - a list of text used for comparision (url, title, description) -/// * `stop_words` - A list of language specific stop words. -/// * `punctuation` - list of punctuation symbols. -/// ### Returns -/// * `score` - The average tf-idf score of the word tokens (and synonyms) in the query -fn calculate_tf_idf( - query: &str, - documents: &[String], - stop_words: &[String], - punctuation: &[String], -) -> f32 { - use keyword_extraction::{ - tf_idf::{TfIdf, TfIdfParams}, - tokenizer::Tokenizer, - }; - - let params = TfIdfParams::UnprocessedDocuments(documents, stop_words, Some(punctuation)); - let tf_idf = TfIdf::new(params); - let tokener = Tokenizer::new(query, stop_words, Some(punctuation)); - let query_tokens = tokener.split_into_words(); - let mut search_tokens = vec![]; - - for token in query_tokens { - #[cfg(any( - feature = "use-synonyms-search", - feature = "use-non-static-synonyms-search" - ))] - { - // find some synonyms and add them to the search (from wordnet or moby if feature is enabled) - let synonyms = synonyms(&token); - search_tokens.extend(synonyms) - } - search_tokens.push(token); - } - - let mut total_score = 0.0f32; - for token in search_tokens.iter() { - total_score += tf_idf.get_score(token); - } - - let result = total_score / (search_tokens.len() as f32); - - f32::from(!result.is_nan()) * result -} diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs index 427b876..a7116c6 100644 --- a/src/results/aggregator.rs +++ b/src/results/aggregator.rs @@ -10,15 +10,9 @@ use crate::models::{ use error_stack::Report; use futures::stream::FuturesUnordered; -use regex::Regex; use reqwest::{Client, ClientBuilder}; use std::sync::Arc; -use tokio::{ - fs::File, - io::{AsyncBufReadExt, BufReader}, - task::JoinHandle, - time::Duration, -}; +use tokio::{task::JoinHandle, time::Duration}; /// A constant for holding the prebuilt Client globally in the app. static CLIENT: std::sync::OnceLock = std::sync::OnceLock::new(); @@ -153,77 +147,7 @@ pub async fn aggregate( }; } - let mut results: Vec = result_map - .iter() - .map(|(_, value)| { - let mut copy = value.clone(); - if !copy.url.contains("temu.com") { - copy.calculate_relevance(query.as_str()) - } - copy - }) - .collect(); - sort_search_results(&mut results); + let results: Vec = result_map.iter().map(|(_, value)| value.clone()).collect(); Ok(SearchResults::new(results, &engine_errors_info)) } - -/// Filters a map of search results using a list of regex patterns. -/// -/// # Arguments -/// -/// * `map_to_be_filtered` - A mutable reference to a `Vec` of search results to filter, where the filtered results will be removed from. -/// * `resultant_map` - A mutable reference to a `Vec` to hold the filtered results. -/// * `file_path` - A `&str` representing the path to a file containing regex patterns to use for filtering. -/// -/// # Errors -/// -/// Returns an error if the file at `file_path` cannot be opened or read, or if a regex pattern is invalid. -pub async fn filter_with_lists( - map_to_be_filtered: &mut Vec<(String, SearchResult)>, - resultant_map: &mut Vec<(String, SearchResult)>, - file_path: &str, -) -> Result<(), Box> { - let reader = BufReader::new(File::open(file_path).await?); - let mut lines = reader.lines(); - - while let Some(line) = lines.next_line().await? { - let re = Regex::new(line.trim())?; - - let mut length = map_to_be_filtered.len(); - let mut idx: usize = Default::default(); - // Iterate over each search result in the map and check if it matches the regex pattern - while idx < length { - let ele = &map_to_be_filtered[idx]; - let ele_inner = &ele.1; - match re.is_match(&ele.0.to_lowercase()) - || re.is_match(&ele_inner.title.to_lowercase()) - || re.is_match(&ele_inner.description.to_lowercase()) - { - true => { - // If the search result matches the regex pattern, move it from the original map to the resultant map - resultant_map.push(map_to_be_filtered.swap_remove(idx)); - length -= 1; - } - false => idx += 1, - }; - } - } - - Ok(()) -} -/// Sorts SearchResults by relevance score. -///
sort_unstable is used as its faster,stability is not an issue on our side. -/// For reasons why, check out [`this`](https://rust-lang.github.io/rfcs/1884-unstable-sort.html) -/// # Arguments -/// * `results` - A mutable slice or Vec of SearchResults -/// -fn sort_search_results(results: &mut [SearchResult]) { - results.sort_unstable_by(|a, b| { - use std::cmp::Ordering; - - b.relevance_score - .partial_cmp(&a.relevance_score) - .unwrap_or(Ordering::Less) - }) -}