Merge branch 'rolling' into feat-error-box-for-engine-errors

zhou fan 2023-08-24 08:02:03 +08:00 committed by GitHub
commit 575a7f95ba
13 changed files with 344 additions and 349 deletions

View file

@@ -15,7 +15,7 @@ use websurfx::{config::parser::Config, run};
#[actix_web::main]
async fn main() -> std::io::Result<()> {
// Initialize the parsed config file.
let config = Config::parse(true).unwrap();
let config = Config::parse(false).unwrap();
log::info!(
"started server on port {} and IP {}",

View file

@@ -34,7 +34,7 @@ pub struct Config {
pub aggregator: AggregatorConfig,
pub logging: bool,
pub debug: bool,
pub upstream_search_engines: Vec<String>,
pub upstream_search_engines: Vec<crate::engines::engine_models::EngineHandler>,
pub request_timeout: u8,
pub threads: u8,
}
@@ -57,7 +57,7 @@ impl Config {
/// # Arguments
///
/// * `logging_initialized` - It takes a boolean which ensures that the logging doesn't get
/// initialized twice.
/// initialized twice. Pass false if the logger has not yet been initialized.
///
/// # Error
///
@@ -77,22 +77,8 @@ impl Config {
let debug: bool = globals.get::<_, bool>("debug")?;
let logging: bool = globals.get::<_, bool>("logging")?;
// Check whether logging has not been initialized before.
if logging_initialized {
if let Ok(pkg_env_var) = std::env::var("PKG_ENV"){
if pkg_env_var.to_lowercase() == "dev" {
env_logger::Builder::new().filter(None, LevelFilter::Trace).init();
}
} else {
// Initializing logging middleware with level set to default or info.
let mut log_level: LevelFilter = LevelFilter::Error;
if logging && debug == false {
log_level = LevelFilter::Info;
} else if debug {
log_level = LevelFilter::Debug;
};
env_logger::Builder::new().filter(None, log_level).init();
}
if !logging_initialized {
set_logging_level(debug, logging);
}
let threads: u8 = if parsed_threads == 0 {
@@ -121,12 +107,14 @@ impl Config {
.get::<_, HashMap<String, bool>>("upstream_search_engines")?
.into_iter()
.filter_map(|(key, value)| value.then_some(key))
.filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
.collect(),
request_timeout: globals.get::<_, u8>("request_timeout")?,
threads,
})
})
}
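The two `filter_map` calls above first drop engines that are disabled in the config and then drop names the engine layer does not recognize. Below is a rough, self-contained sketch of that chain; `parse_engine` is a hypothetical stand-in for `EngineHandler::new`, and the map literal stands in for the deserialized config table (both are illustrative assumptions, not the crate's API).

use std::collections::HashMap;

// Hypothetical stand-in for EngineHandler::new: returns None for names the
// engine layer does not recognize.
fn parse_engine(name: &str) -> Option<String> {
    match name.to_lowercase().as_str() {
        "duckduckgo" | "searx" => Some(name.to_lowercase()),
        _ => None,
    }
}

fn main() {
    // Roughly what the deserialized config table looks like: engine name -> enabled flag.
    let raw: HashMap<String, bool> = HashMap::from([
        ("DuckDuckGo".to_string(), true),
        ("Searx".to_string(), false),
        ("Unknown".to_string(), true),
    ]);

    // The first filter_map keeps only enabled engines, the second drops names
    // that cannot be parsed -- mirroring the chain in Config::parse above.
    let engines: Vec<String> = raw
        .into_iter()
        .filter_map(|(key, value)| value.then_some(key))
        .filter_map(|engine| parse_engine(&engine))
        .collect();

    assert_eq!(engines, vec!["duckduckgo".to_string()]);
}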
/// A helper function which returns an appropriate config file path checking if the config
/// file exists on that path.
///
@@ -173,3 +161,25 @@ impl Config {
Err("Config file not found!!".to_string().into())
}
}
/// A helper function that sets the proper logging level.
fn set_logging_level(debug: bool, logging: bool) {
if let Ok(pkg_env_var) = std::env::var("PKG_ENV") {
if pkg_env_var.to_lowercase() == "dev" {
env_logger::Builder::new()
.filter(None, LevelFilter::Trace)
.init();
return;
}
}
// Initializing logging middleware with level set to default or info.
let log_level = match (debug, logging) {
(true, true) => LevelFilter::Debug,
(true, false) => LevelFilter::Debug,
(false, true) => LevelFilter::Info,
(false, false) => LevelFilter::Error,
};
env_logger::Builder::new().filter(None, log_level).init();
}
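A minimal sketch of the same decision table, assuming the `PKG_ENV=dev` override always wins; `Level` and `choose_level` are illustrative stand-ins for `log::LevelFilter` and `set_logging_level`, kept free of `env_logger` so the mapping can be checked in isolation.

// Stand-in for log::LevelFilter, for illustration only.
#[derive(Debug, PartialEq)]
enum Level {
    Error,
    Info,
    Debug,
    Trace,
}

fn choose_level(pkg_env: Option<&str>, debug: bool, logging: bool) -> Level {
    // The dev override wins regardless of the config flags.
    if matches!(pkg_env, Some(v) if v.eq_ignore_ascii_case("dev")) {
        return Level::Trace;
    }
    // Same arms as set_logging_level: debug takes precedence over logging.
    match (debug, logging) {
        (true, _) => Level::Debug,
        (false, true) => Level::Info,
        (false, false) => Level::Error,
    }
}

fn main() {
    assert_eq!(choose_level(Some("dev"), false, false), Level::Trace);
    assert_eq!(choose_level(None, true, false), Level::Debug);
    assert_eq!(choose_level(None, false, true), Level::Info);
    assert_eq!(choose_level(None, false, false), Level::Error);
}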

View file

@@ -7,7 +7,7 @@ use std::collections::HashMap;
use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use scraper::{Html, Selector};
use crate::results::aggregation_models::RawSearchResult;
use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine};
@@ -43,7 +43,7 @@ impl SearchEngine for DuckDuckGo {
page: u32,
user_agent: String,
request_timeout: u8,
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that upstream server receives valid page number.
let url: String = match page {
@@ -120,7 +120,7 @@ impl SearchEngine for DuckDuckGo {
Ok(document
.select(&results)
.map(|result| {
RawSearchResult::new(
SearchResult::new(
result
.select(&result_title)
.next()
@@ -147,7 +147,7 @@ impl SearchEngine for DuckDuckGo {
vec!["duckduckgo".to_string()],
)
})
.map(|search_result| (search_result.visiting_url.clone(), search_result))
.map(|search_result| (search_result.url.clone(), search_result))
.collect())
}
}
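The final `.map(...)` keys each scraped result by its `url` (previously `visiting_url`) before collecting into a `HashMap`, so repeated links from one engine collapse to a single entry. A small illustrative sketch of that keying behaviour, with `Scraped` as an assumed stand-in for the scraped result type:

use std::collections::HashMap;

// Illustrative stand-in for the scraped result type; only the fields needed
// to show the keying behaviour are included.
#[derive(Clone, Debug)]
struct Scraped {
    url: String,
    title: String,
}

fn main() {
    let scraped = vec![
        Scraped { url: "https://example.com/a".into(), title: "A".into() },
        Scraped { url: "https://example.com/a".into(), title: "A (duplicate)".into() },
        Scraped { url: "https://example.com/b".into(), title: "B".into() },
    ];

    // Collecting (url, result) pairs into a HashMap keeps one entry per URL;
    // when a key repeats, the later value wins.
    let map: HashMap<String, Scraped> = scraped
        .into_iter()
        .map(|result| (result.url.clone(), result))
        .collect();

    assert_eq!(map.len(), 2);
    println!("{:?}", map.get("https://example.com/a").map(|hit| &hit.title));
}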

View file

@@ -1,7 +1,7 @@
//! This module provides the error enum to handle different errors associated while requesting data from
//! the upstream search engines with the search query provided by the user.
use crate::results::aggregation_models::RawSearchResult;
use crate::results::aggregation_models::SearchResult;
use error_stack::{IntoReport, Result, ResultExt};
use std::{collections::HashMap, fmt, time::Duration};
@@ -43,9 +43,9 @@ impl fmt::Display for EngineError {
impl error_stack::Context for EngineError {}
/// A trait to define common behaviour for all search engines.
/// A trait to define common behavior for all search engines.
#[async_trait::async_trait]
pub trait SearchEngine {
pub trait SearchEngine: Sync + Send {
async fn fetch_html_from_upstream(
&self,
url: String,
@@ -56,7 +56,7 @@ pub trait SearchEngine {
Ok(reqwest::Client::new()
.get(url)
.timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
.headers(header_map) // add spoofed headers to emulate human behaviour
.headers(header_map) // add spoofed headers to emulate human behavior
.send()
.await
.into_report()
@@ -73,5 +73,37 @@ pub trait SearchEngine {
page: u32,
user_agent: String,
request_timeout: u8,
) -> Result<HashMap<String, RawSearchResult>, EngineError>;
) -> Result<HashMap<String, SearchResult>, EngineError>;
}
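The new `Sync + Send` bound on the trait appears to exist so that boxed engines can be moved into `tokio::spawn`, whose futures must be `Send`. A rough sketch of the constraint, where `Fetch` and `Dummy` are illustrative stand-ins rather than types from this crate:

// Without the Send supertrait, Box<dyn Fetch> would not be Send and the
// spawned future below would be rejected by the compiler.
#[async_trait::async_trait]
trait Fetch: Sync + Send {
    async fn fetch(&self) -> String;
}

struct Dummy;

#[async_trait::async_trait]
impl Fetch for Dummy {
    async fn fetch(&self) -> String {
        "ok".to_string()
    }
}

#[tokio::main]
async fn main() {
    let engine: Box<dyn Fetch> = Box::new(Dummy);
    // The trait object is moved into the task, so everything it captures
    // must be Send for tokio::spawn to accept the future.
    let handle = tokio::spawn(async move { engine.fetch().await });
    println!("{}", handle.await.unwrap());
}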
pub struct EngineHandler {
engine: Box<dyn SearchEngine>,
name: &'static str,
}
impl Clone for EngineHandler {
fn clone(&self) -> Self {
Self::new(self.name).unwrap()
}
}
impl EngineHandler {
/// Parses an engine name into an engine handler; returns `None` if the engine is unknown.
pub fn new(engine_name: &str) -> Option<Self> {
let engine: (&'static str, Box<dyn SearchEngine>) =
match engine_name.to_lowercase().as_str() {
"duckduckgo" => ("duckduckgo", Box::new(super::duckduckgo::DuckDuckGo)),
"searx" => ("searx", Box::new(super::searx::Searx)),
_ => return None,
};
Some(Self {
engine: engine.1,
name: engine.0,
})
}
pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
(self.name, self.engine)
}
}
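A hedged test sketch (not part of the commit) of how `EngineHandler` is meant to be used: unknown names yield `None`, matching is case-insensitive, and `into_name_engine` splits the handler back into its name and boxed engine.

#[cfg(test)]
mod engine_handler_sketch {
    use super::*;

    #[test]
    fn parses_known_engines_and_rejects_unknown_ones() {
        // Unknown names yield None instead of panicking, which is what lets
        // the config and cookie parsing silently skip bad entries.
        assert!(EngineHandler::new("not-a-real-engine").is_none());

        // Matching is case-insensitive; the stored name is the lowercase form.
        let handler = EngineHandler::new("DuckDuckGo").expect("known engine");

        // Clone re-runs the name lookup because Box<dyn SearchEngine> itself
        // cannot be cloned.
        let copy = handler.clone();
        let (name, _engine) = copy.into_name_engine();
        assert_eq!(name, "duckduckgo");
    }
}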

View file

@@ -6,7 +6,7 @@ use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use scraper::{Html, Selector};
use std::collections::HashMap;
use crate::results::aggregation_models::RawSearchResult;
use crate::results::aggregation_models::SearchResult;
use super::engine_models::{EngineError, SearchEngine};
use error_stack::{IntoReport, Report, Result, ResultExt};
@@ -42,7 +42,7 @@ impl SearchEngine for Searx {
page: u32,
user_agent: String,
request_timeout: u8,
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that upstream server receives valid page number.
let url: String = match page {
@@ -111,7 +111,7 @@ impl SearchEngine for Searx {
Ok(document
.select(&results)
.map(|result| {
RawSearchResult::new(
SearchResult::new(
result
.select(&result_title)
.next()
@@ -137,7 +137,7 @@ impl SearchEngine for Searx {
vec!["searx".to_string()],
)
})
.map(|search_result| (search_result.visiting_url.clone(), search_result))
.map(|search_result| (search_result.url.clone(), search_result))
.collect())
}
}

View file

@@ -5,55 +5,6 @@ use serde::{Deserialize, Serialize};
use crate::{config::parser_models::Style, engines::engine_models::EngineError};
/// A named struct to store, serialize and deserializes the individual search result from all the
/// scraped and aggregated search results from the upstream search engines.
///
/// # Fields
///
/// * `title` - The title of the search result.
/// * `visiting_url` - The url which is accessed when clicked on it (href url in html in simple
/// words).
/// * `url` - The url to be displayed below the search result title in html.
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which this results were provided.
#[derive(Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub title: String,
pub visiting_url: String,
pub url: String,
pub description: String,
pub engine: Vec<String>,
}
impl SearchResult {
/// Constructs a new `SearchResult` with the given arguments needed for the struct.
///
/// # Arguments
///
/// * `title` - The title of the search result.
/// * `visiting_url` - The url which is accessed when clicked on it
/// (href url in html in simple words).
/// * `url` - The url to be displayed below the search result title in html.
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which this results were provided.
pub fn new(
title: String,
visiting_url: String,
url: String,
description: String,
engine: Vec<String>,
) -> Self {
SearchResult {
title,
visiting_url,
url,
description,
engine,
}
}
}
/// A named struct to store the raw search results scraped from the
/// upstream search engines before aggregating them. It derives the Clone trait which is needed
/// to write idiomatic Rust using `Iterators`.
@@ -61,37 +12,33 @@ impl SearchResult {
/// # Fields
///
/// * `title` - The title of the search result.
/// * `visiting_url` - The url which is accessed when clicked on it
/// * `url` - The url which is accessed when clicked on it
/// (href url in html in simple words).
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided.
#[derive(Clone)]
pub struct RawSearchResult {
#[derive(Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub title: String,
pub visiting_url: String,
pub url: String,
pub description: String,
pub engine: Vec<String>,
}
impl RawSearchResult {
impl SearchResult {
/// Constructs a new `SearchResult` with the given arguments needed for the struct.
///
/// # Arguments
///
/// * `title` - The title of the search result.
/// * `visiting_url` - The url which is accessed when clicked on it
/// * `url` - The url which is accessed when clicked on it
/// (href url in html in simple words).
/// * `description` - The description of the search result.
/// * `engine` - The names of the upstream engines from which these results were provided.
pub fn new(
title: String,
visiting_url: String,
description: String,
engine: Vec<String>,
) -> Self {
RawSearchResult {
pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
SearchResult {
title,
visiting_url,
url,
description,
engine,
}
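With `visiting_url` gone, the merged type is both the scraping-side and the template-side result, serialized with camelCase renaming. An illustrative copy of the shape (field names taken from the struct above; `SearchResultShape` is an assumed stand-in, not the crate's type) serialized with `serde_json`:

use serde::{Deserialize, Serialize};

// Illustrative copy of the consolidated result shape; the real type also
// carries helpers such as add_engines used later by the aggregator.
#[derive(Clone, Serialize, Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
struct SearchResultShape {
    title: String,
    url: String,
    description: String,
    engine: Vec<String>,
}

fn main() {
    let result = SearchResultShape {
        title: "Example".into(),
        url: "https://example.com".into(),
        description: "An example result".into(),
        engine: vec!["duckduckgo".into()],
    };
    // The single `url` field now serves both as the link shown to the user
    // and as the key the aggregator deduplicates on.
    println!("{}", serde_json::to_string_pretty(&result).unwrap());
}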

View file

@@ -8,18 +8,14 @@ use rand::Rng;
use tokio::task::JoinHandle;
use super::{
aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
user_agent::random_user_agent,
};
use crate::engines::{
duckduckgo,
engine_models::{EngineError, SearchEngine},
searx,
};
use crate::engines::engine_models::{EngineError, EngineHandler};
/// Aliases for long type annotations
type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
/// The function aggregates the scraped results from the user-selected upstream search engines.
/// These engines can be chosen either from the user interface (UI) or from the configuration file.
@@ -64,139 +60,93 @@ pub async fn aggregate(
page: u32,
random_delay: bool,
debug: bool,
upstream_search_engines: Vec<String>,
upstream_search_engines: Vec<EngineHandler>,
request_timeout: u8,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
let user_agent: String = random_user_agent();
let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
// Add a random delay before making the request.
if random_delay || !debug {
let mut rng = rand::thread_rng();
let delay_secs = rng.gen_range(1..10);
std::thread::sleep(Duration::from_secs(delay_secs));
tokio::time::sleep(Duration::from_secs(delay_secs)).await;
}
// fetch results from upstream search engines simultaneously/concurrently.
let search_engines: Vec<Box<dyn SearchEngine + Send + Sync>> = upstream_search_engines
.iter()
.map(|engine| match engine.to_lowercase().as_str() {
"duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
"searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
&_ => panic!("Config Error: Incorrect config file option provided"),
})
.collect();
let mut names: Vec<&str> = vec![];
let task_capacity: usize = search_engines.len();
// create tasks for upstream result fetching
let mut tasks: FutureVec = FutureVec::new();
let tasks: FutureVec = search_engines
.into_iter()
.map(|search_engine| {
let query: String = query.clone();
let user_agent: String = user_agent.clone();
tokio::spawn(async move {
search_engine
.results(query, page, user_agent.clone(), request_timeout)
.await
})
})
.collect();
for engine_handler in upstream_search_engines {
let (name, search_engine) = engine_handler.into_name_engine();
names.push(name);
let query: String = query.clone();
let user_agent: String = user_agent.clone();
tasks.push(tokio::spawn(async move {
search_engine
.results(query, page, user_agent.clone(), request_timeout)
.await
}));
}
let mut outputs = Vec::with_capacity(task_capacity);
// get upstream responses
let mut responses = Vec::with_capacity(tasks.len());
for task in tasks {
if let Ok(result) = task.await {
outputs.push(result)
responses.push(result)
}
}
// aggregate search results, removing duplicates and handling errors the upstream engines returned
let mut result_map: HashMap<String, SearchResult> = HashMap::new();
let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
// The code block `outputs.iter()` determines whether it is the first time the code is being run.
// It does this by checking the initial flag. If it is the first time, the code selects the first
// engine from which results are fetched and adds or extends them into the `result_map`. If the
// initially selected engine fails, the code automatically selects another engine to map or extend
// into the `result_map`. On the other hand, if an engine selected for the first time successfully
// fetches results and maps them into the `result_map`, the initial flag is set to false. Subsequently,
// the code iterates through the remaining engines one by one. It compares the fetched results from each
// engine with the results already present in the `result_map` to identify any duplicates. If duplicate
// results are found, the code groups them together with the name of the engine from which they were
// fetched, and automatically removes the duplicate results from the newly fetched data.
//
// Additionally, the code handles errors returned by the engines. It keeps track of which engines
// encountered errors and stores this information in a vector of structures called `EngineErrorInfo`.
// Each structure in this vector contains the name of the engine and the type of error it returned.
// These structures will later be added to the final `SearchResults` structure. The `SearchResults`
// structure is used to display an error box in the UI containing the relevant information from
// the `EngineErrorInfo` structure.
//
// In summary, this code block manages the selection of engines, handling of duplicate results, and tracking
// of errors in order to populate the `result_map` and provide informative feedback to the user through the
// `SearchResults` structure.
let mut initial: bool = true;
let mut counter: usize = 0;
outputs.iter().for_each(|results| {
if initial {
match results {
Ok(result) => {
result_map.extend(result.clone());
counter += 1;
initial = false
let mut handle_error = |error: Report<EngineError>, engine_name: String| {
log::error!("Engine Error: {:?}", error);
engine_errors_info.push(EngineErrorInfo::new(
error.downcast_ref::<EngineError>().unwrap(),
engine_name.to_string(),
));
};
for _ in 0..responses.len() {
let response = responses.pop().unwrap();
let engine = names.pop().unwrap().to_string();
if result_map.is_empty() {
match response {
Ok(results) => {
result_map = results.clone();
}
Err(error_type) => {
log::error!("Engine Error: {:?}", error_type);
engine_errors_info.push(EngineErrorInfo::new(
error_type.downcast_ref::<EngineError>().unwrap(),
upstream_search_engines[counter].clone(),
));
counter += 1
Err(error) => {
handle_error(error, engine);
}
}
} else {
match results {
Ok(result) => {
result.clone().into_iter().for_each(|(key, value)| {
result_map
.entry(key)
.and_modify(|result| {
result.add_engines(value.clone().engine());
})
.or_insert_with(|| -> RawSearchResult {
RawSearchResult::new(
value.title.clone(),
value.visiting_url.clone(),
value.description.clone(),
value.engine.clone(),
)
});
});
counter += 1
}
Err(error_type) => {
log::error!("Engine Error: {:?}", error_type);
engine_errors_info.push(EngineErrorInfo::new(
error_type.downcast_ref::<EngineError>().unwrap(),
upstream_search_engines[counter].clone(),
));
counter += 1
}
continue;
}
match response {
Ok(result) => {
result.into_iter().for_each(|(key, value)| {
result_map
.entry(key)
.and_modify(|result| {
result.add_engines(engine.clone());
})
.or_insert_with(|| -> SearchResult { value });
});
}
Err(error) => {
handle_error(error, engine);
}
}
});
}
let results = result_map.into_values().collect();
Ok(SearchResults::new(
result_map
.into_iter()
.map(|(key, value)| {
SearchResult::new(
value.title,
value.visiting_url,
key,
value.description,
value.engine,
)
})
.collect(),
results,
query.to_string(),
engine_errors_info,
))
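The `entry` / `and_modify` / `or_insert_with` chain above merges each engine's batch into the shared map: a URL seen before only gains the new engine's name, while a fresh URL is inserted as-is. A self-contained sketch of that merge step, with `Hit` and `merge` as illustrative stand-ins rather than the crate's types:

use std::collections::HashMap;

// Illustrative stand-in for the aggregated result; only the fields the merge
// step touches are kept.
#[derive(Clone, Debug)]
struct Hit {
    title: String,
    engine: Vec<String>,
}

fn merge(map: &mut HashMap<String, Hit>, engine: &str, batch: Vec<(String, Hit)>) {
    for (url, value) in batch {
        map.entry(url)
            // Duplicate URL: keep the existing hit, just record that this
            // engine returned it too.
            .and_modify(|hit| hit.engine.push(engine.to_string()))
            // New URL: take the freshly scraped hit as-is.
            .or_insert_with(|| value);
    }
}

fn main() {
    let mut map = HashMap::new();
    let first = Hit { title: "A".into(), engine: vec!["duckduckgo".into()] };
    merge(&mut map, "duckduckgo", vec![("https://a".into(), first)]);

    let second = Hit { title: "A".into(), engine: vec!["searx".into()] };
    merge(&mut map, "searx", vec![("https://a".into(), second)]);

    assert_eq!(map.len(), 1);
    assert_eq!(map["https://a"].engine, vec!["duckduckgo", "searx"]);
}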

View file

@@ -7,12 +7,14 @@ use std::fs::read_to_string;
use crate::{
cache::cacher::RedisCache,
config::parser::Config,
engines::engine_models::EngineHandler,
handler::public_paths::public_path,
results::{aggregation_models::SearchResults, aggregator::aggregate},
};
use actix_web::{get, web, HttpRequest, HttpResponse};
use handlebars::Handlebars;
use serde::Deserialize;
use tokio::join;
/// A named struct which deserializes all the user provided search parameters and stores them.
///
@@ -96,15 +98,49 @@ pub async fn search(
}
let page = match &params.page {
Some(page) => *page,
None => 0,
None => 1,
};
let url = format!(
"http://{}:{}/search?q={}&page={}",
config.binding_ip, config.port, query, page
let (_, results, _) = join!(
results(
format!(
"http://{}:{}/search?q={}&page={}",
config.binding_ip,
config.port,
query,
page - 1
),
&config,
query.to_string(),
page - 1,
req.clone(),
),
results(
format!(
"http://{}:{}/search?q={}&page={}",
config.binding_ip, config.port, query, page
),
&config,
query.to_string(),
page,
req.clone(),
),
results(
format!(
"http://{}:{}/search?q={}&page={}",
config.binding_ip,
config.port,
query,
page + 1
),
&config,
query.to_string(),
page + 1,
req.clone(),
)
);
let results_json = results(url, &config, query.to_string(), page, req).await?;
let page_content: String = hbs.render("search", &results_json)?;
let page_content: String = hbs.render("search", &results?)?;
Ok(HttpResponse::Ok().body(page_content))
}
None => Ok(HttpResponse::Found()
@@ -140,12 +176,19 @@ async fn results(
{
Some(cookie_value) => {
let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
let engines = cookie_value
.engines
.iter()
.filter_map(|name| EngineHandler::new(name))
.collect();
aggregate(
query,
page,
config.aggregator.random_delay,
config.debug,
cookie_value.engines,
engines,
config.request_timeout,
)
.await?
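The `join!` block earlier in this file fetches the previous, current, and next page concurrently and renders only the middle result; the neighbouring calls are presumably there to warm the cache behind `results`. A rough sketch of that pattern with an assumed `fetch_page` helper (not the crate's API):

// Stand-in for the per-page results() call; in the real route this would
// consult the cache and fall back to aggregate().
async fn fetch_page(page: u32) -> Result<String, std::io::Error> {
    Ok(format!("results for page {page}"))
}

#[tokio::main]
async fn main() -> Result<(), std::io::Error> {
    let page: u32 = 1;
    // The three fetches run concurrently; saturating_sub keeps the sketch
    // safe when page is already 0.
    let (_prev, current, _next) = tokio::join!(
        fetch_page(page.saturating_sub(1)),
        fetch_page(page),
        fetch_page(page + 1),
    );
    // Only the requested page's outcome decides the response; errors in the
    // neighbouring prefetches are deliberately ignored.
    println!("{}", current?);
    Ok(())
}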