2023-04-27 17:53:28 +03:00
//! This module provides public models for handling, storing and serializing of search results
//! data scraped from the upstream search engines.
2023-11-18 21:38:02 +03:00
use super ::engine_models ::EngineError ;
2023-05-02 11:58:21 +03:00
use serde ::{ Deserialize , Serialize } ;
2023-08-27 21:02:23 +03:00
use smallvec ::SmallVec ;
2024-03-25 09:16:49 +00:00
#[ cfg(any(
feature = " use-synonyms-search " ,
feature = " use-non-static-synonyms-search "
) ) ]
use thesaurus ::synonyms ;
2023-04-27 17:53:28 +03:00
/// A named struct to store the raw scraped search results scraped search results from the
/// upstream search engines before aggregating it.It derives the Clone trait which is needed
/// to write idiomatic rust using `Iterators`.
/// (href url in html in simple words).
2023-08-18 10:43:53 +02:00
#[ derive(Clone, Serialize, Deserialize) ]
2023-04-22 14:35:07 +03:00
#[ serde(rename_all = " camelCase " ) ]
pub struct SearchResult {
2023-09-03 19:23:34 +03:00
/// The title of the search result.
2023-04-22 14:35:07 +03:00
pub title : String ,
2023-09-03 19:23:34 +03:00
/// The url which is accessed when clicked on it
2023-04-22 14:35:07 +03:00
pub url : String ,
2023-09-03 19:23:34 +03:00
/// The description of the search result.
2023-04-22 14:35:07 +03:00
pub description : String ,
2023-09-03 19:23:34 +03:00
/// The names of the upstream engines from which this results were provided.
2023-08-27 21:02:23 +03:00
pub engine : SmallVec < [ String ; 0 ] > ,
2024-03-25 09:16:49 +00:00
/// The td-tdf score of the result in regards to the title, url and description and the user's query
pub relevance_score : f32 ,
2023-04-22 14:35:07 +03:00
}
2023-04-25 16:30:04 +03:00
impl SearchResult {
2023-04-27 17:53:28 +03:00
/// Constructs a new `RawSearchResult` with the given arguments needed for the struct.
///
/// # Arguments
///
/// * `title` - The title of the search result.
2023-08-17 22:48:20 +02:00
/// * `url` - The url which is accessed when clicked on it
2023-04-27 17:53:28 +03:00
/// (href url in html in simple words).
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which this results were provided.
2023-08-27 21:02:23 +03:00
pub fn new ( title : & str , url : & str , description : & str , engine : & [ & str ] ) -> Self {
2023-04-25 16:30:04 +03:00
SearchResult {
2023-08-27 21:02:23 +03:00
title : title . to_owned ( ) ,
url : url . to_owned ( ) ,
description : description . to_owned ( ) ,
2024-03-25 09:16:49 +00:00
relevance_score : 0.0 ,
2023-08-27 21:02:23 +03:00
engine : engine . iter ( ) . map ( | name | name . to_string ( ) ) . collect ( ) ,
2023-04-25 16:30:04 +03:00
}
}
2024-03-25 09:16:49 +00:00
/// calculates and update the relevance score of the current search.
/// # Arguments
///
/// * query - the query string used to obtain the results
///
///
pub fn calculate_relevance ( & mut self , query : & str ) {
use stop_words ::{ get , LANGUAGE } ;
// when language settings can change to any of the ones supported on this crate: https://docs.rs/crate/stop-words/0.8.0
let documents = [
self . title . clone ( ) ,
self . url . clone ( ) ,
self . description . clone ( ) ,
] ;
let stop_words = get ( LANGUAGE ::English ) ;
let punctuation = [
" . " . to_owned ( ) ,
" , " . to_owned ( ) ,
" : " . to_owned ( ) ,
" ; " . to_owned ( ) ,
" ! " . to_owned ( ) ,
" ? " . to_owned ( ) ,
" ( " . to_owned ( ) ,
" ) " . to_owned ( ) ,
" [ " . to_owned ( ) ,
" ] " . to_owned ( ) ,
" { " . to_owned ( ) ,
" } " . to_owned ( ) ,
" \" " . to_owned ( ) ,
" ' " . to_owned ( ) ,
" < " . to_owned ( ) ,
" > " . to_owned ( ) ,
] ;
self . relevance_score = calculate_tf_idf ( query , & documents , & stop_words , & punctuation ) ;
}
2023-04-27 17:53:28 +03:00
/// A function which adds the engine name provided as a string into a vector of strings.
///
/// # Arguments
///
/// * `engine` - Takes an engine name provided as a String.
2023-08-27 21:02:23 +03:00
pub fn add_engines ( & mut self , engine : & str ) {
self . engine . push ( engine . to_owned ( ) )
2023-04-25 16:30:04 +03:00
}
2023-04-26 17:46:49 +03:00
2023-04-30 19:24:16 +03:00
/// A function which returns the engine name stored from the struct as a string.
///
/// # Returns
///
/// An engine name stored as a string from the struct.
2023-08-27 21:02:23 +03:00
pub fn engine ( & mut self ) -> String {
std ::mem ::take ( & mut self . engine [ 0 ] )
2023-04-26 17:46:49 +03:00
}
2023-04-25 16:30:04 +03:00
}
2023-09-03 19:23:34 +03:00
/// A named struct that stores the error info related to the upstream search engines.
2023-08-27 21:02:23 +03:00
#[ derive(Serialize, Deserialize, Clone) ]
2023-07-14 21:26:29 +03:00
pub struct EngineErrorInfo {
2023-09-03 19:23:34 +03:00
/// It stores the error type which occured while fetching the result from a particular search
/// engine.
2023-07-14 21:26:29 +03:00
pub error : String ,
2023-09-03 19:23:34 +03:00
/// It stores the name of the engine that failed to provide the requested search results.
2023-07-14 21:26:29 +03:00
pub engine : String ,
2023-09-03 19:23:34 +03:00
/// It stores the name of the color to indicate whether how severe the particular error is (In
/// other words it indicates the severity of the error/issue).
2023-08-10 04:32:47 +03:00
pub severity_color : String ,
2023-07-14 21:26:29 +03:00
}
impl EngineErrorInfo {
2023-09-03 19:23:34 +03:00
/// Constructs a new `SearchResult` with the given arguments needed for the struct.
///
/// # Arguments
///
/// * `error` - It takes the error type which occured while fetching the result from a particular
/// search engine.
/// * `engine` - It takes the name of the engine that failed to provide the requested search results.
2023-08-27 21:02:23 +03:00
pub fn new ( error : & EngineError , engine : & str ) -> Self {
2023-07-14 21:26:29 +03:00
Self {
error : match error {
2023-10-08 22:30:31 +02:00
EngineError ::NoSuchEngineFound ( _ ) = > " EngineNotFound " . to_owned ( ) ,
2023-08-27 21:02:23 +03:00
EngineError ::RequestError = > " RequestError " . to_owned ( ) ,
EngineError ::EmptyResultSet = > " EmptyResultSet " . to_owned ( ) ,
EngineError ::UnexpectedError = > " UnexpectedError " . to_owned ( ) ,
2023-07-14 21:26:29 +03:00
} ,
2023-08-27 21:02:23 +03:00
engine : engine . to_owned ( ) ,
2023-08-10 04:32:47 +03:00
severity_color : match error {
2023-10-08 22:30:31 +02:00
EngineError ::NoSuchEngineFound ( _ ) = > " red " . to_owned ( ) ,
2023-08-27 21:02:23 +03:00
EngineError ::RequestError = > " green " . to_owned ( ) ,
EngineError ::EmptyResultSet = > " blue " . to_owned ( ) ,
EngineError ::UnexpectedError = > " red " . to_owned ( ) ,
2023-08-10 04:32:47 +03:00
} ,
2023-07-14 21:26:29 +03:00
}
}
}
2023-05-23 09:34:46 +00:00
/// A named struct to store, serialize, deserialize the all the search results scraped and
2023-05-02 11:58:21 +03:00
/// aggregated from the upstream search engines.
2023-04-27 17:53:28 +03:00
/// `SearchResult` structs.
2023-09-11 23:20:05 +02:00
#[ derive(Serialize, Deserialize, Default, Clone) ]
2023-04-22 14:35:07 +03:00
#[ serde(rename_all = " camelCase " ) ]
pub struct SearchResults {
2023-09-03 19:23:34 +03:00
/// Stores the individual serializable `SearchResult` struct into a vector of
2023-04-22 14:35:07 +03:00
pub results : Vec < SearchResult > ,
2023-09-03 19:23:34 +03:00
/// Stores the information on which engines failed with their engine name
/// and the type of error that caused it.
2023-07-14 21:26:29 +03:00
pub engine_errors_info : Vec < EngineErrorInfo > ,
2023-09-12 17:59:33 +03:00
/// Stores the flag option which holds the check value that the following
/// search query was disallowed when the safe search level set to 4 and it
2023-09-12 17:49:46 +03:00
/// was present in the `Blocklist` file.
2023-09-02 17:48:27 +03:00
pub disallowed : bool ,
2023-09-12 17:59:33 +03:00
/// Stores the flag option which holds the check value that the following
/// search query was filtered when the safe search level set to 3 and it
2023-09-12 17:49:46 +03:00
/// was present in the `Blocklist` file.
2023-09-02 17:48:27 +03:00
pub filtered : bool ,
2023-09-23 12:48:01 +03:00
/// Stores the safe search level `safesearch` provided in the search url.
pub safe_search_level : u8 ,
/// Stores the flag option which holds the check value that whether any search engines were
/// selected or not.
pub no_engines_selected : bool ,
2023-04-22 14:35:07 +03:00
}
2023-04-25 16:30:04 +03:00
impl SearchResults {
2023-04-27 17:53:28 +03:00
/// Constructs a new `SearchResult` with the given arguments needed for the struct.
///
/// # Arguments
///
/// * `results` - Takes an argument of individual serializable `SearchResult` struct
/// and stores it into a vector of `SearchResult` structs.
/// * `page_query` - Takes an argument of current page`s search query `q` provided in
/// the search url.
2023-09-12 17:59:33 +03:00
/// * `engine_errors_info` - Takes an array of structs which contains information regarding
/// which engines failed with their names, reason and their severity color name.
2023-11-18 21:38:02 +03:00
pub fn new ( results : Vec < SearchResult > , engine_errors_info : & [ EngineErrorInfo ] ) -> Self {
2023-08-27 21:02:23 +03:00
Self {
2023-04-25 16:30:04 +03:00
results ,
2023-09-10 18:56:54 +03:00
engine_errors_info : engine_errors_info . to_owned ( ) ,
2023-09-02 17:48:27 +03:00
disallowed : Default ::default ( ) ,
filtered : Default ::default ( ) ,
2023-09-23 12:48:01 +03:00
safe_search_level : Default ::default ( ) ,
no_engines_selected : Default ::default ( ) ,
2023-04-25 16:30:04 +03:00
}
}
2023-04-30 18:16:08 +03:00
2023-09-02 17:48:27 +03:00
/// A setter function that sets disallowed to true.
pub fn set_disallowed ( & mut self ) {
self . disallowed = true ;
}
/// A setter function that sets the filtered to true.
2024-03-11 12:01:30 +03:00
pub fn set_filtered ( & mut self , filtered : bool ) {
self . filtered = filtered ;
2023-09-02 17:48:27 +03:00
}
/// A getter function that gets the value of `engine_errors_info`.
pub fn engine_errors_info ( & mut self ) -> Vec < EngineErrorInfo > {
std ::mem ::take ( & mut self . engine_errors_info )
}
/// A getter function that gets the value of `results`.
pub fn results ( & mut self ) -> Vec < SearchResult > {
self . results . clone ( )
2023-04-30 18:16:08 +03:00
}
2023-09-23 12:48:01 +03:00
/// A setter function to set the current page safe search level.
pub fn set_safe_search_level ( & mut self , safe_search_level : u8 ) {
self . safe_search_level = safe_search_level ;
}
/// A getter function that gets the value of `no_engines_selected`.
pub fn no_engines_selected ( & self ) -> bool {
self . no_engines_selected
}
/// A setter function to set the `no_engines_selected` to true.
pub fn set_no_engines_selected ( & mut self ) {
self . no_engines_selected = true ;
}
2023-04-25 16:30:04 +03:00
}
2024-03-25 09:16:49 +00:00
/// Helper function to calculate the tf-idf for the search query.
/// <br> The approach is as [`as`](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).
/// <br> Find a sample article about TF-IDF [`here`](https://medium.com/analytics-vidhya/tf-idf-term-frequency-technique-easiest-explanation-for-text-classification-in-nlp-with-code-8ca3912e58c3)
/// ### Arguments
/// * `query` - a user's search query
/// * `documents` - a list of text used for comparision (url, title, description)
/// * `stop_words` - A list of language specific stop words.
/// * `punctuation` - list of punctuation symbols.
/// ### Returns
/// * `score` - The average tf-idf score of the word tokens (and synonyms) in the query
fn calculate_tf_idf (
query : & str ,
documents : & [ String ] ,
stop_words : & [ String ] ,
punctuation : & [ String ] ,
) -> f32 {
use keyword_extraction ::{
tf_idf ::{ TfIdf , TfIdfParams } ,
tokenizer ::Tokenizer ,
} ;
let params = TfIdfParams ::UnprocessedDocuments ( documents , stop_words , Some ( punctuation ) ) ;
let tf_idf = TfIdf ::new ( params ) ;
let tokener = Tokenizer ::new ( query , stop_words , Some ( punctuation ) ) ;
let query_tokens = tokener . split_into_words ( ) ;
let mut search_tokens = vec! [ ] ;
for token in query_tokens {
#[ cfg(any(
feature = " use-synonyms-search " ,
feature = " use-non-static-synonyms-search "
) ) ]
{
// find some synonyms and add them to the search (from wordnet or moby if feature is enabled)
let synonyms = synonyms ( & token ) ;
search_tokens . extend ( synonyms )
}
search_tokens . push ( token ) ;
}
let mut total_score = 0.0 f32 ;
for token in search_tokens . iter ( ) {
total_score + = tf_idf . get_score ( token ) ;
}
let result = total_score / ( search_tokens . len ( ) as f32 ) ;
f32 ::from ( ! result . is_nan ( ) ) * result
}