✨ Ranking of aggregated search results based on relevancy of the search result to the user's search query (#549)

* add sorting by relevance and merge new changes * fix conflicts * Update src/models/aggregation_models.rs Co-authored-by: neon_arch <mustafadhuleb53@gmail.com> * Update src/models/aggregation_models.rs Co-authored-by: neon_arch <mustafadhuleb53@gmail.com> * Update Cargo.toml Co-authored-by: neon_arch <mustafadhuleb53@gmail.com> * Update Cargo.toml Co-authored-by: neon_arch <mustafadhuleb53@gmail.com> * Update Cargo.toml Co-authored-by: neon_arch <mustafadhuleb53@gmail.com> * enable non-static-synonyms features --------- Co-authored-by: neon_arch <mustafadhuleb53@gmail.com>
2024-03-25 09:16:49 +00:00 · 2024-03-25 09:16:49 +00:00 · bb50e8bb25
commit bb50e8bb25
parent c584a7d601
4 changed files with 296 additions and 41 deletions
--- a/src/models/aggregation_models.rs
+++ b/src/models/aggregation_models.rs
@ -4,7 +4,11 @@
 use super::engine_models::EngineError;
 use serde::{Deserialize, Serialize};
 use smallvec::SmallVec;
-
+#[cfg(any(
+    feature = "use-synonyms-search",
+    feature = "use-non-static-synonyms-search"
+))]
+use thesaurus::synonyms;
 /// A named struct to store the raw scraped search results scraped search results from the
 /// upstream search engines before aggregating it.It derives the Clone trait which is needed
 /// to write idiomatic rust using `Iterators`.
@ -20,6 +24,8 @@ pub struct SearchResult {
    pub description: String,
    /// The names of the upstream engines from which this results were provided.
    pub engine: SmallVec<[String; 0]>,
+    /// The tf-idf score of the result with regard to the title, url and description and the user's query
+    pub relevance_score: f32,
 }

 impl SearchResult {
@ -37,9 +43,49 @@ impl SearchResult {
            title: title.to_owned(),
            url: url.to_owned(),
            description: description.to_owned(),
+            relevance_score: 0.0,
            engine: engine.iter().map(|name| name.to_string()).collect(),
        }
    }
+    /// Calculates the relevance score of this search result for the given query and
+    /// stores it in `relevance_score`.
+    ///
+    /// The title, url and description of the result are passed to `calculate_tf_idf` as
+    /// the documents to score the query against, together with the English stop word
+    /// list and a fixed set of punctuation symbols.
+    ///
+    /// # Arguments
+    ///
+    /// * `query` - The query string used to obtain the results.
+    pub fn calculate_relevance(&mut self, query: &str) {
+        use stop_words::{get, LANGUAGE};
+
+        // Symbols passed to the scoring helper as punctuation.
+        let punctuation = [
+            ".", ",", ":", ";", "!", "?", "(", ")", "[", "]", "{", "}", "\"", "'", "<", ">",
+        ]
+        .map(str::to_owned);
+
+        // Only the English list is used for now; the stop-words crate supports further
+        // languages should the language setting become configurable:
+        // https://docs.rs/crate/stop-words/0.8.0
+        let stop_words = get(LANGUAGE::English);
+
+        // The three text fields of the result form the document set.
+        let documents = [&self.title, &self.url, &self.description].map(String::clone);
+
+        self.relevance_score = calculate_tf_idf(query, &documents, &stop_words, &punctuation);
+    }

    /// A function which adds the engine name provided as a string into a vector of strings.
    ///
@ -182,3 +228,53 @@ impl SearchResults {
        self.no_engines_selected = true;
    }
 }
+
+/// Helper function to calculate the tf-idf score of a search query against a set of documents.
+/// <br> The approach follows the definition of [`tf-idf`](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).
+/// <br> Find a sample article about TF-IDF [`here`](https://medium.com/analytics-vidhya/tf-idf-term-frequency-technique-easiest-explanation-for-text-classification-in-nlp-with-code-8ca3912e58c3)
+/// ### Arguments
+/// * `query` - A user's search query.
+/// * `documents` - A list of texts used for comparison (url, title, description).
+/// * `stop_words` - A list of language specific stop words.
+/// * `punctuation` - A list of punctuation symbols.
+/// ### Returns
+/// * `score` - The average tf-idf score of the word tokens (and synonyms) in the query, or
+///   `0.0` when the query yields no tokens to score or the average is not a number.
+fn calculate_tf_idf(
+    query: &str,
+    documents: &[String],
+    stop_words: &[String],
+    punctuation: &[String],
+) -> f32 {
+    use keyword_extraction::{
+        tf_idf::{TfIdf, TfIdfParams},
+        tokenizer::Tokenizer,
+    };
+
+    let params = TfIdfParams::UnprocessedDocuments(documents, stop_words, Some(punctuation));
+    let tf_idf = TfIdf::new(params);
+    let tokener = Tokenizer::new(query, stop_words, Some(punctuation));
+    let query_tokens = tokener.split_into_words();
+    let mut search_tokens = vec![];
+
+    for token in query_tokens {
+        #[cfg(any(
+            feature = "use-synonyms-search",
+            feature = "use-non-static-synonyms-search"
+        ))]
+        {
+            // find some synonyms and add them to the search (from wordnet or moby if feature is enabled)
+            let synonyms = synonyms(&token);
+            search_tokens.extend(synonyms)
+        }
+        search_tokens.push(token);
+    }
+
+    // A query without any scorable token would divide 0.0 by 0 below and produce NaN,
+    // so it is given the neutral score directly.
+    if search_tokens.is_empty() {
+        return 0.0;
+    }
+
+    let total_score = search_tokens
+        .iter()
+        .fold(0.0f32, |sum, token| sum + tf_idf.get_score(token));
+    let result = total_score / (search_tokens.len() as f32);
+
+    // Multiplying NaN by 0.0 still yields NaN, so the fallback has to be an explicit branch.
+    if result.is_nan() {
+        0.0
+    } else {
+        result
+    }
+}
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@ -8,6 +8,7 @@ use crate::models::{
    aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
    engine_models::{EngineError, EngineHandler},
 };
+
 use error_stack::Report;
 use futures::stream::FuturesUnordered;
 use regex::Regex;
@ -184,7 +185,17 @@ pub async fn aggregate(
        drop(blacklist_map);
    }

-    let results: Vec<SearchResult> = result_map.iter().map(|(_, value)| value.clone()).collect();
+    let mut results: Vec<SearchResult> = result_map
+        .iter()
+        .map(|(_, value)| {
+            let mut copy = value.clone();
+            if !copy.url.contains("temu.com") {
+                copy.calculate_relevance(query.as_str())
+            }
+            copy
+        })
+        .collect();
+    sort_search_results(&mut results);

    Ok(SearchResults::new(results, &engine_errors_info))
 }
@ -233,7 +244,21 @@ pub async fn filter_with_lists(

    Ok(())
 }
+/// Sorts SearchResults by relevance score, highest score first.
+/// <br> `sort_unstable_by` is used as it is faster; stability is not an issue on our side.
+/// For reasons why, check out [`this`](https://rust-lang.github.io/rfcs/1884-unstable-sort.html)
+/// <br> Results whose score is NaN are ordered after every other result.
+/// # Arguments
+/// * `results` - A mutable slice or Vec of SearchResults
+///
+fn sort_search_results(results: &mut [SearchResult]) {
+    // Map NaN to the lowest possible key so the comparator below is a total order.
+    // Falling back to a fixed `Ordering` when `partial_cmp` fails is not one, and
+    // the sort is allowed to panic on comparators that are not a total order.
+    let sort_key = |result: &SearchResult| {
+        if result.relevance_score.is_nan() {
+            f32::NEG_INFINITY
+        } else {
+            result.relevance_score
+        }
+    };
+
+    // Operands are swapped to get a descending order.
+    results.sort_unstable_by(|a, b| sort_key(b).total_cmp(&sort_key(a)))
+}
 #[cfg(test)]
 mod tests {
    use super::*;
@ -252,6 +277,7 @@ mod tests {
                url: "https://www.example.com".to_owned(),
                description: "This domain is for use in illustrative examples in documents."
                    .to_owned(),
+                relevance_score: 0.0,
                engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
            },
        ));
@ -262,6 +288,7 @@ mod tests {
                url: "https://www.rust-lang.org/".to_owned(),
                description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
                engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
+                relevance_score:0.0
            },)
        );

@ -302,6 +329,7 @@ mod tests {
                description: "This domain is for use in illustrative examples in documents."
                    .to_owned(),
                engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
+                relevance_score: 0.0,
            },
        ));
        map_to_be_filtered.push((
@ -311,6 +339,7 @@ mod tests {
                url: "https://www.rust-lang.org/".to_owned(),
                description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
                engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
+                relevance_score:0.0
            },
        ));

@ -367,6 +396,7 @@ mod tests {
                description: "This domain is for use in illustrative examples in documents."
                    .to_owned(),
                engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
+                relevance_score: 0.0,
            },
        ));