Merge branch 'rolling' into change-document-style-with-linter-warnings

This commit is contained in:
neon_arch 2023-09-12 17:49:46 +03:00 committed by GitHub
commit fb231de416
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
26 changed files with 1116 additions and 486 deletions

View file

@ -2,6 +2,7 @@
//! data scraped from the upstream search engines.
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use crate::{config::parser_models::Style, engines::engine_models::EngineError};
@ -19,7 +20,7 @@ pub struct SearchResult {
/// The description of the search result.
pub description: String,
/// The names of the upstream engines from which these results were provided.
pub engine: Vec<String>,
pub engine: SmallVec<[String; 0]>,
}
impl SearchResult {
@ -32,12 +33,12 @@ impl SearchResult {
/// (href url in html in simple words).
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided.
pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
pub fn new(title: &str, url: &str, description: &str, engine: &[&str]) -> Self {
SearchResult {
title,
url,
description,
engine,
title: title.to_owned(),
url: url.to_owned(),
description: description.to_owned(),
engine: engine.iter().map(|name| name.to_string()).collect(),
}
}
@ -46,8 +47,8 @@ impl SearchResult {
/// # Arguments
///
/// * `engine` - Takes an engine name provided as a String.
pub fn add_engines(&mut self, engine: String) {
self.engine.push(engine)
pub fn add_engines(&mut self, engine: &str) {
self.engine.push(engine.to_owned())
}
/// A function which returns the engine name stored from the struct as a string.
@ -55,13 +56,13 @@ impl SearchResult {
/// # Returns
///
/// An engine name stored as a string from the struct.
pub fn engine(self) -> String {
self.engine.get(0).unwrap().to_string()
pub fn engine(&mut self) -> String {
std::mem::take(&mut self.engine[0])
}
}
/// A named struct that stores the error info related to the upstream search engines.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub struct EngineErrorInfo {
/// It stores the error type which occurred while fetching the result from a particular search
/// engine.
@ -81,18 +82,18 @@ impl EngineErrorInfo {
/// * `error` - It takes the error type which occurred while fetching the result from a particular
/// search engine.
/// * `engine` - It takes the name of the engine that failed to provide the requested search results.
pub fn new(error: &EngineError, engine: String) -> Self {
pub fn new(error: &EngineError, engine: &str) -> Self {
Self {
error: match error {
EngineError::RequestError => String::from("RequestError"),
EngineError::EmptyResultSet => String::from("EmptyResultSet"),
EngineError::UnexpectedError => String::from("UnexpectedError"),
EngineError::RequestError => "RequestError".to_owned(),
EngineError::EmptyResultSet => "EmptyResultSet".to_owned(),
EngineError::UnexpectedError => "UnexpectedError".to_owned(),
},
engine,
engine: engine.to_owned(),
severity_color: match error {
EngineError::RequestError => String::from("green"),
EngineError::EmptyResultSet => String::from("blue"),
EngineError::UnexpectedError => String::from("red"),
EngineError::RequestError => "green".to_owned(),
EngineError::EmptyResultSet => "blue".to_owned(),
EngineError::UnexpectedError => "red".to_owned(),
},
}
}
@ -101,7 +102,7 @@ impl EngineErrorInfo {
/// A named struct to store, serialize and deserialize all the search results scraped and
/// aggregated from the upstream search engines as `SearchResult` structs.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
/// Stores the individual serializable `SearchResult` struct into a vector of
@ -113,6 +114,14 @@ pub struct SearchResults {
/// Stores the information on which engines failed with their engine name
/// and the type of error that caused it.
pub engine_errors_info: Vec<EngineErrorInfo>,
/// Stores the flag which indicates that the search query was disallowed
/// because the safe search level is set to 4 and the query was present in
/// the `Blocklist` file.
pub disallowed: bool,
/// Stores the flag which indicates that the search query was filtered
/// because the safe search level is set to 3 and the query was present in
/// the `Blocklist` file.
pub filtered: bool,
}
impl SearchResults {
@ -126,21 +135,48 @@ impl SearchResults {
/// the search url.
/// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
/// given search query.
/// * ``
pub fn new(
results: Vec<SearchResult>,
page_query: String,
engine_errors_info: Vec<EngineErrorInfo>,
page_query: &str,
engine_errors_info: &[EngineErrorInfo],
) -> Self {
SearchResults {
Self {
results,
page_query,
style: Style::new("".to_string(), "".to_string()),
engine_errors_info,
page_query: page_query.to_owned(),
style: Style::default(),
engine_errors_info: engine_errors_info.to_owned(),
disallowed: Default::default(),
filtered: Default::default(),
}
}
/// A setter function to add website style to the return search results.
pub fn add_style(&mut self, style: Style) {
self.style = style;
pub fn add_style(&mut self, style: &Style) {
self.style = style.clone();
}
/// A setter function that sets disallowed to true.
pub fn set_disallowed(&mut self) {
self.disallowed = true;
}
/// A setter function to set the current page search query.
pub fn set_page_query(&mut self, page: &str) {
self.page_query = page.to_owned();
}
/// A setter function that sets `filtered` to true.
pub fn set_filtered(&mut self) {
self.filtered = true;
}
/// A getter function that takes the value of `engine_errors_info` out of the struct,
/// leaving an empty vector in its place.
pub fn engine_errors_info(&mut self) -> Vec<EngineErrorInfo> {
std::mem::take(&mut self.engine_errors_info)
}
/// A getter function that returns a clone of `results`.
pub fn results(&mut self) -> Vec<SearchResult> {
self.results.clone()
}
}

View file

@ -64,14 +64,15 @@ type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<Eng
/// function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults` struct
/// containing appropriate values.
pub async fn aggregate(
query: String,
query: &str,
page: u32,
random_delay: bool,
debug: bool,
upstream_search_engines: Vec<EngineHandler>,
upstream_search_engines: &[EngineHandler],
request_timeout: u8,
safe_search: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
let user_agent: String = random_user_agent();
let user_agent: &str = random_user_agent();
// Add a random delay before making the request.
if random_delay || !debug {
@ -80,19 +81,24 @@ pub async fn aggregate(
tokio::time::sleep(Duration::from_secs(delay_secs)).await;
}
let mut names: Vec<&str> = vec![];
let mut names: Vec<&str> = Vec::with_capacity(0);
// create tasks for upstream result fetching
let mut tasks: FutureVec = FutureVec::new();
for engine_handler in upstream_search_engines {
let (name, search_engine) = engine_handler.into_name_engine();
let (name, search_engine) = engine_handler.to_owned().into_name_engine();
names.push(name);
let query: String = query.clone();
let user_agent: String = user_agent.clone();
let query: String = query.to_owned();
tasks.push(tokio::spawn(async move {
search_engine
.results(query, page, user_agent.clone(), request_timeout)
.results(
&query,
page,
user_agent.clone(),
request_timeout,
safe_search,
)
.await
}));
}
@ -110,7 +116,7 @@ pub async fn aggregate(
let mut result_map: HashMap<String, SearchResult> = HashMap::new();
let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
let mut handle_error = |error: Report<EngineError>, engine_name: String| {
let mut handle_error = |error: &Report<EngineError>, engine_name: &'static str| {
log::error!("Engine Error: {:?}", error);
engine_errors_info.push(EngineErrorInfo::new(
error.downcast_ref::<EngineError>().unwrap(),
@ -120,7 +126,7 @@ pub async fn aggregate(
for _ in 0..responses.len() {
let response = responses.pop().unwrap();
let engine = names.pop().unwrap().to_string();
let engine = names.pop().unwrap();
if result_map.is_empty() {
match response {
@ -128,7 +134,7 @@ pub async fn aggregate(
result_map = results.clone();
}
Err(error) => {
handle_error(error, engine);
handle_error(&error, engine);
}
}
continue;
@ -140,39 +146,37 @@ pub async fn aggregate(
result_map
.entry(key)
.and_modify(|result| {
result.add_engines(engine.clone());
result.add_engines(engine);
})
.or_insert_with(|| -> SearchResult { value });
});
}
Err(error) => {
handle_error(error, engine);
handle_error(&error, engine);
}
}
}
let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
filter_with_lists(
&mut result_map,
&mut blacklist_map,
&file_path(FileType::BlockList)?,
)?;
if safe_search >= 3 {
let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new();
filter_with_lists(
&mut result_map,
&mut blacklist_map,
file_path(FileType::BlockList)?,
)?;
filter_with_lists(
&mut blacklist_map,
&mut result_map,
&file_path(FileType::AllowList)?,
)?;
filter_with_lists(
&mut blacklist_map,
&mut result_map,
file_path(FileType::AllowList)?,
)?;
drop(blacklist_map);
drop(blacklist_map);
}
let results: Vec<SearchResult> = result_map.into_values().collect();
Ok(SearchResults::new(
results,
query.to_string(),
engine_errors_info,
))
Ok(SearchResults::new(results, query, &engine_errors_info))
}
/// Filters a map of search results using a list of regex patterns.
@ -194,7 +198,7 @@ pub fn filter_with_lists(
let mut reader = BufReader::new(File::open(file_path)?);
for line in reader.by_ref().lines() {
let re = Regex::new(&line?)?;
let re = Regex::new(line?.trim())?;
// Iterate over each search result in the map and check if it matches the regex pattern
for (url, search_result) in map_to_be_filtered.clone().into_iter() {
@ -203,7 +207,10 @@ pub fn filter_with_lists(
|| re.is_match(&search_result.description.to_lowercase())
{
// If the search result matches the regex pattern, move it from the original map to the resultant map
resultant_map.insert(url.clone(), map_to_be_filtered.remove(&url).unwrap());
resultant_map.insert(
url.to_owned(),
map_to_be_filtered.remove(&url.to_owned()).unwrap(),
);
}
}
}
@ -214,6 +221,7 @@ pub fn filter_with_lists(
#[cfg(test)]
mod tests {
use super::*;
use smallvec::smallvec;
use std::collections::HashMap;
use std::io::Write;
use tempfile::NamedTempFile;
@ -223,22 +231,22 @@ mod tests {
// Create a map of search results to filter
let mut map_to_be_filtered = HashMap::new();
map_to_be_filtered.insert(
"https://www.example.com".to_string(),
"https://www.example.com".to_owned(),
SearchResult {
title: "Example Domain".to_string(),
url: "https://www.example.com".to_string(),
title: "Example Domain".to_owned(),
url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
.to_string(),
engine: vec!["Google".to_string(), "Bing".to_string()],
.to_owned(),
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
);
map_to_be_filtered.insert(
"https://www.rust-lang.org/".to_string(),
"https://www.rust-lang.org/".to_owned(),
SearchResult {
title: "Rust Programming Language".to_string(),
url: "https://www.rust-lang.org/".to_string(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string(),
engine: vec!["Google".to_string(), "DuckDuckGo".to_string()],
title: "Rust Programming Language".to_owned(),
url: "https://www.rust-lang.org/".to_owned(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
},
);
@ -267,22 +275,22 @@ mod tests {
fn test_filter_with_lists_wildcard() -> Result<(), Box<dyn std::error::Error>> {
let mut map_to_be_filtered = HashMap::new();
map_to_be_filtered.insert(
"https://www.example.com".to_string(),
"https://www.example.com".to_owned(),
SearchResult {
title: "Example Domain".to_string(),
url: "https://www.example.com".to_string(),
title: "Example Domain".to_owned(),
url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
.to_string(),
engine: vec!["Google".to_string(), "Bing".to_string()],
.to_owned(),
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
);
map_to_be_filtered.insert(
"https://www.rust-lang.org/".to_string(),
"https://www.rust-lang.org/".to_owned(),
SearchResult {
title: "Rust Programming Language".to_string(),
url: "https://www.rust-lang.org/".to_string(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string(),
engine: vec!["Google".to_string(), "DuckDuckGo".to_string()],
title: "Rust Programming Language".to_owned(),
url: "https://www.rust-lang.org/".to_owned(),
description: "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_owned(),
engine: smallvec!["Google".to_owned(), "DuckDuckGo".to_owned()],
},
);
@ -327,13 +335,13 @@ mod tests {
fn test_filter_with_lists_invalid_regex() {
let mut map_to_be_filtered = HashMap::new();
map_to_be_filtered.insert(
"https://www.example.com".to_string(),
"https://www.example.com".to_owned(),
SearchResult {
title: "Example Domain".to_string(),
url: "https://www.example.com".to_string(),
title: "Example Domain".to_owned(),
url: "https://www.example.com".to_owned(),
description: "This domain is for use in illustrative examples in documents."
.to_string(),
engine: vec!["Google".to_string(), "Bing".to_string()],
.to_owned(),
engine: smallvec!["Google".to_owned(), "Bing".to_owned()],
},
);

View file

@ -1,30 +1,34 @@
//! This module provides the functionality to generate random user agent string.
use std::sync::OnceLock;
use fake_useragent::{Browsers, UserAgents, UserAgentsBuilder};
/// A static variable which stores the initially built `UserAgents` struct, so that it can be reused
/// again and again without the need of reinitializing the `UserAgents` struct.
static USER_AGENTS: once_cell::sync::Lazy<UserAgents> = once_cell::sync::Lazy::new(|| {
UserAgentsBuilder::new()
.cache(false)
.dir("/tmp")
.thread(1)
.set_browsers(
Browsers::new()
.set_chrome()
.set_safari()
.set_edge()
.set_firefox()
.set_mozilla(),
)
.build()
});
static USER_AGENTS: OnceLock<UserAgents> = OnceLock::new();
/// A function to generate random user agent to improve privacy of the user.
///
/// # Returns
///
/// A randomly generated user agent string.
pub fn random_user_agent() -> String {
USER_AGENTS.random().to_string()
pub fn random_user_agent() -> &'static str {
USER_AGENTS
.get_or_init(|| {
UserAgentsBuilder::new()
.cache(false)
.dir("/tmp")
.thread(1)
.set_browsers(
Browsers::new()
.set_chrome()
.set_safari()
.set_edge()
.set_firefox()
.set_mozilla(),
)
.build()
})
.random()
}