diff --git a/Cargo.lock b/Cargo.lock
index f6b5ae5..0cca780 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -284,6 +284,21 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "alloc-no-stdlib"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3"
+
+[[package]]
+name = "alloc-stdlib"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece"
+dependencies = [
+ "alloc-no-stdlib",
+]
+
 [[package]]
 name = "anes"
 version = "0.1.6"
@@ -314,6 +329,20 @@ version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
 
+[[package]]
+name = "async-compression"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5"
+dependencies = [
+ "brotli",
+ "flate2",
+ "futures-core",
+ "memchr",
+ "pin-project-lite",
+ "tokio 1.34.0",
+]
+
 [[package]]
 name = "async-once-cell"
 version = "0.5.3"
@@ -412,6 +441,27 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "brotli"
+version = "3.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+ "brotli-decompressor",
+]
+
+[[package]]
+name = "brotli-decompressor"
+version = "2.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f"
+dependencies = [
+ "alloc-no-stdlib",
+ "alloc-stdlib",
+]
+
 [[package]]
 name = "bstr"
 version = "1.8.0"
@@ -477,9 +527,9 @@ dependencies = [
 
 [[package]]
 name = "cargo-platform"
-version = "0.1.4"
+version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12024c4645c97566567129c204f65d5815a8c9aecf30fcbe682b2fe034996d36"
+checksum = "e34637b3140142bdf929fb439e8aa4ebad7651ebf7b1080b3930aa16ac1459ff"
 dependencies = [
  "serde",
 ]
@@ -2819,6 +2869,7 @@ version = "0.11.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b"
 dependencies = [
+ "async-compression",
  "base64 0.21.5",
  "bytes 1.5.0",
  "encoding_rs",
@@ -2844,6 +2895,7 @@ dependencies = [
  "system-configuration",
  "tokio 1.34.0",
  "tokio-rustls",
+ "tokio-util",
  "tower-service",
  "url 2.4.1",
  "wasm-bindgen",
@@ -2899,9 +2951,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.24"
+version = "0.38.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ad981d6c340a49cdc40a1028d9c6084ec7e9fa33fcb839cab656a267071e234"
+checksum = "dc99bc2d4f1fed22595588a013687477aedf3cdcfb26558c559edb67b4d9b22e"
 dependencies = [
  "bitflags 2.4.1",
  "errno",
@@ -3989,7 +4041,7 @@ checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
 
 [[package]]
 name = "websurfx"
-version = "1.2.26"
+version = "1.2.27"
 dependencies = [
  "actix-cors",
  "actix-files",
diff --git a/Cargo.toml b/Cargo.toml
index ffae799..b3f1521 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "websurfx"
-version = "1.2.26"
+version = "1.2.27"
 edition = "2021"
 description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
 repository = "https://github.com/neon-mmd/websurfx"
@@ -13,7 +13,7 @@ bench = false
 path = "src/bin/websurfx.rs"
 
 [dependencies]
-reqwest = {version="0.11.22", default-features=false, features=["rustls-tls"]}
+reqwest = {version="0.11.22", default-features=false, features=["rustls-tls","brotli", "gzip"]}
 tokio = {version="1.32.0",features=["rt-multi-thread","macros"], default-features = false}
 serde = {version="1.0.190", default-features=false, features=["derive"]}
 serde_json = {version="1.0.108", default-features=false}
diff --git a/src/engines/brave.rs b/src/engines/brave.rs
index 5c7c126..49626e3 100644
--- a/src/engines/brave.rs
+++ b/src/engines/brave.rs
@@ -4,7 +4,7 @@
 
 use std::collections::HashMap;
 
-use reqwest::header::HeaderMap;
+use reqwest::{header::HeaderMap, Client};
 use scraper::Html;
 
 use crate::models::aggregation_models::SearchResult;
@@ -42,7 +42,7 @@ impl SearchEngine for Brave {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError> {
         let url = format!("https://search.brave.com/search?q={query}&offset={page}");
@@ -68,7 +68,7 @@ impl SearchEngine for Brave {
             .change_context(EngineError::UnexpectedError)?;
 
         let document: Html = Html::parse_document(
-            &Brave::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
+            &Brave::fetch_html_from_upstream(self, &url, header_map, client).await?,
         );
 
         if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs
index 352a33b..fadddb6 100644
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@@ -5,6 +5,7 @@
 use std::collections::HashMap;
 
 use reqwest::header::HeaderMap;
+use reqwest::Client;
 use scraper::Html;
 
 use crate::models::aggregation_models::SearchResult;
@@ -44,7 +45,7 @@ impl SearchEngine for DuckDuckGo {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         _safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
@@ -76,7 +77,7 @@ impl SearchEngine for DuckDuckGo {
             .change_context(EngineError::UnexpectedError)?;
 
         let document: Html = Html::parse_document(
-            &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
+            &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, client).await?,
         );
 
         if self.parser.parse_for_no_results(&document).next().is_some() {
diff --git a/src/engines/searx.rs b/src/engines/searx.rs
index 79c1e95..7bf0431 100644
--- a/src/engines/searx.rs
+++ b/src/engines/searx.rs
@@ -3,6 +3,7 @@
 //! number if provided.
 
 use reqwest::header::HeaderMap;
+use reqwest::Client;
 use scraper::Html;
 use std::collections::HashMap;
 
@@ -40,7 +41,7 @@ impl SearchEngine for Searx {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         mut safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
@@ -68,7 +69,7 @@ impl SearchEngine for Searx {
             .change_context(EngineError::UnexpectedError)?;
 
         let document: Html = Html::parse_document(
-            &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
+            &Searx::fetch_html_from_upstream(self, &url, header_map, client).await?,
         );
 
         if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
diff --git a/src/models/engine_models.rs b/src/models/engine_models.rs
index 98367e8..f8e966e 100644
--- a/src/models/engine_models.rs
+++ b/src/models/engine_models.rs
@@ -3,7 +3,8 @@
 
 use super::aggregation_models::SearchResult;
 use error_stack::{Report, Result, ResultExt};
-use std::{collections::HashMap, fmt, time::Duration};
+use reqwest::Client;
+use std::{collections::HashMap, fmt};
 
 /// A custom error type used for handle engine associated errors.
 #[derive(Debug)]
@@ -71,12 +72,11 @@ pub trait SearchEngine: Sync + Send {
         &self,
         url: &str,
         header_map: reqwest::header::HeaderMap,
-        request_timeout: u8,
+        client: &Client,
     ) -> Result<String, EngineError> {
         // fetch the html from upstream search engine
-        Ok(reqwest::Client::new()
+        Ok(client
             .get(url)
-            .timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
             .headers(header_map) // add spoofed headers to emulate human behavior
             .send()
             .await
@@ -109,7 +109,7 @@ pub trait SearchEngine: Sync + Send {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError>;
 }
diff --git a/src/results/aggregator.rs b/src/results/aggregator.rs
index f605260..5c53864 100644
--- a/src/results/aggregator.rs
+++ b/src/results/aggregator.rs
@@ -9,6 +9,7 @@ use crate::models::{
 };
 use error_stack::Report;
 use regex::Regex;
+use reqwest::{Client, ClientBuilder};
 use std::time::{SystemTime, UNIX_EPOCH};
 use std::{
     collections::HashMap,
@@ -18,6 +19,9 @@ use std::{
 use std::{fs::File, io::BufRead};
 use tokio::task::JoinHandle;
 
+/// A constant for holding the prebuilt Client globally in the app.
+static CLIENT: std::sync::OnceLock<Client> = std::sync::OnceLock::new();
+
 /// Aliases for long type annotations
 type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
 
@@ -68,6 +72,16 @@ pub async fn aggregate(
     request_timeout: u8,
     safe_search: u8,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
+    let client = CLIENT.get_or_init(|| {
+        ClientBuilder::new()
+            .timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
+            .https_only(true)
+            .gzip(true)
+            .brotli(true)
+            .build()
+            .unwrap()
+    });
+
     let user_agent: &str = random_user_agent();
 
     // Add a random delay before making the request.
@@ -88,7 +102,7 @@ pub async fn aggregate(
         let query: String = query.to_owned();
         tasks.push(tokio::spawn(async move {
             search_engine
-                .results(&query, page, user_agent, request_timeout, safe_search)
+                .results(&query, page, user_agent, client, safe_search)
                 .await
         }));
    }
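
A note on the pattern this diff introduces: `aggregate` now builds a single `reqwest::Client` lazily via `std::sync::OnceLock` and hands a `&Client` to every engine, so reqwest's connection pool (keep-alive sockets, TLS sessions) is shared across upstream fetches instead of being recreated by `reqwest::Client::new()` on every request. Below is a minimal, self-contained sketch of that pattern; the `shared_client` helper, the 30-second timeout, and the example URL are illustrative assumptions and not part of this patch:

```rust
use std::{sync::OnceLock, time::Duration};

use reqwest::{Client, ClientBuilder};

// Shared client: built on first use, then reused by every request so the
// connection pool is shared instead of rebuilt per request.
static CLIENT: OnceLock<Client> = OnceLock::new();

// Hypothetical helper mirroring the aggregator's get_or_init call.
fn shared_client(timeout_secs: u8) -> &'static Client {
    CLIENT.get_or_init(|| {
        ClientBuilder::new()
            .timeout(Duration::from_secs(timeout_secs as u64))
            .https_only(true) // plain-HTTP upstreams are refused
            .gzip(true) // transparently decompress gzip bodies
            .brotli(true) // transparently decompress brotli bodies
            .build()
            .expect("failed to build the shared reqwest client")
    })
}

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // get_or_init runs its closure at most once: the timeout passed on the
    // first call wins, and values passed later are silently ignored.
    let response = shared_client(30)
        .get("https://example.com") // illustrative URL
        .send()
        .await?;
    println!("status: {}", response.status());
    Ok(())
}
```

Two consequences of this design worth noting: because `get_or_init` initializes at most once, the `request_timeout` supplied by the first `aggregate` call is permanently baked into the shared client; and with the new `gzip`/`brotli` cargo features plus `.gzip(true)`/`.brotli(true)`, the client advertises the matching `Accept-Encoding` values and decompresses response bodies transparently.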