diff --git a/.github/workflows/pr_labeler.yml b/.github/workflows/pr_labeler.yml
index bc7e72b..473e4cc 100644
--- a/.github/workflows/pr_labeler.yml
+++ b/.github/workflows/pr_labeler.yml
@@ -9,7 +9,7 @@ jobs:
       pull-requests: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/labeler@v4
+      - uses: actions/labeler@v5
        with:
          sync-labels: true
          dot: true
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index c4d68e0..c444b5e 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -19,7 +19,7 @@ jobs:
       pull-requests: write
     steps:
-      - uses: actions/stale@v8
+      - uses: actions/stale@v9
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          stale-issue-message: 'Stale issue message'
diff --git a/Cargo.lock b/Cargo.lock
index 398eb1b..203b018 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -82,6 +82,7 @@ dependencies = [
  "ahash 0.8.7",
  "base64 0.21.5",
  "bitflags 2.4.1",
+ "brotli",
  "bytes 1.5.0",
  "bytestring",
  "derive_more",
@@ -363,9 +364,9 @@ checksum = "9338790e78aa95a416786ec8389546c4b6a1dfc3dc36071ed9518a9413a542eb"
 
 [[package]]
 name = "async-trait"
-version = "0.1.75"
+version = "0.1.76"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdf6721fb0140e4f897002dd086c06f6c27775df19cfe1fccb21181a48fd2c98"
+checksum = "531b97fb4cd3dfdce92c35dedbfdc1f0b9d8091c8ca943d6dae340ef5012d514"
 dependencies = [
  "proc-macro2 1.0.71",
  "quote 1.0.33",
@@ -2028,10 +2029,11 @@ dependencies = [
 
 [[package]]
 name = "minify-js"
-version = "0.5.6"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22d6c512a82abddbbc13b70609cb2beff01be2c7afff534d6e5e1c85e438fc8b"
+checksum = "b1fa5546ee8bd66024113e506cabe4230e76635a094c06ea2051b66021dda92e"
 dependencies = [
+ "aho-corasick 0.7.20",
  "lazy_static",
  "parse-js",
 ]
@@ -2328,9 +2330,9 @@ dependencies = [
 
 [[package]]
 name = "parse-js"
-version = "0.17.0"
+version = "0.20.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ec3b11d443640ec35165ee8f6f0559f1c6f41878d70330fe9187012b5935f02"
+checksum = "2742b5e32dcb5930447ed9f9e401a7dfd883867fc079c4fac44ae8ba3593710e"
 dependencies = [
  "aho-corasick 0.7.20",
  "bumpalo",
@@ -2790,9 +2792,9 @@ dependencies = [
 
 [[package]]
 name = "redis"
-version = "0.23.3"
+version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f49cdc0bb3f412bf8e7d1bd90fe1d9eb10bc5c399ba90973c14662a27b3f8ba"
+checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
 dependencies = [
  "arc-swap",
  "async-trait",
@@ -3183,9 +3185,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.108"
+version = "1.0.109"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b"
+checksum = "cb0652c533506ad7a2e353cce269330d6afd8bdfb6d75e0ace5b35aacbd7b9e9"
 dependencies = [
  "itoa 1.0.10",
  "ryu",
diff --git a/Cargo.toml b/Cargo.toml
index b80fc2b..c57ac7c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,20 +16,20 @@ path = "src/bin/websurfx.rs"
 reqwest = {version="0.11.22", default-features=false, features=["rustls-tls","brotli", "gzip"]}
 tokio = {version="1.32.0",features=["rt-multi-thread","macros"], default-features = false}
 serde = {version="1.0.190", default-features=false, features=["derive"]}
-serde_json = {version="1.0.108", default-features=false}
+serde_json = {version="1.0.109", default-features=false}
 maud = {version="0.25.0", default-features=false, features=["actix-web"]}
 scraper = {version="0.18.1", default-features = false}
-actix-web = {version="4.4.0", features = ["cookies", "macros"], default-features=false}
+actix-web = {version="4.4.0", features = ["cookies", "macros", "compress-brotli"], default-features=false}
 actix-files = {version="0.6.2", default-features=false}
 actix-cors = {version="0.6.4", default-features=false}
 fake-useragent = {version="0.1.3", default-features=false}
 env_logger = {version="0.10.0", default-features=false}
 log = {version="0.4.20", default-features=false}
 mlua = {version="0.9.1", features=["luajit", "vendored"], default-features=false}
-redis = {version="0.23.3", features=["tokio-comp","connection-manager"], default-features = false, optional = true}
+redis = {version="0.24.0", features=["tokio-comp","connection-manager"], default-features = false, optional = true}
 blake3 = {version="1.5.0", default-features=false}
 error-stack = {version="0.4.0", default-features=false, features=["std"]}
-async-trait = {version="0.1.73", default-features=false}
+async-trait = {version="0.1.76", default-features=false}
 regex = {version="1.9.4", features=["perf"], default-features = false}
 smallvec = {version="1.11.0", features=["union", "serde"], default-features=false}
 futures = {version="0.3.28", default-features=false}
@@ -46,7 +46,7 @@ tempfile = {version="3.8.0", default-features=false}
 
 [build-dependencies]
 lightningcss = {version="1.0.0-alpha.50", default-features=false, features=["grid"]}
-minify-js = {version="0.5.6", default-features=false}
+minify-js = {version="0.6.0", default-features=false}
 
 [profile.dev]
 opt-level = 0
diff --git a/Dockerfile b/Dockerfile
index 83c7fb3..6795a2a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM --platform=$BUILDPLATFORM rust:1.74.0-alpine3.18 AS chef
+FROM --platform=$BUILDPLATFORM rust:1.75.0-alpine3.18 AS chef
 # We only pay the installation cost once,
 # it will be cached from the second build onwards
 RUN apk add --no-cache alpine-sdk musl-dev g++ make libcrypto3 libressl-dev upx perl build-base
diff --git a/dev.Dockerfile b/dev.Dockerfile
index 098abbf..5c4334d 100644
--- a/dev.Dockerfile
+++ b/dev.Dockerfile
@@ -1,5 +1,5 @@
 # Create Builder image
-FROM --platform=$BUILDPLATFORM rust:1.74.0-alpine3.18
+FROM --platform=$BUILDPLATFORM rust:1.75.0-alpine3.18
 
 # Install required dependencies
 RUN apk add --no-cache alpine-sdk musl-dev g++ make libcrypto3 libressl-dev perl build-base
diff --git a/src/engines/bing.rs b/src/engines/bing.rs
new file mode 100644
index 0000000..84dbf93
--- /dev/null
+++ b/src/engines/bing.rs
@@ -0,0 +1,124 @@
+//! The `bing` module handles the scraping of results from the bing search engine
+//! by querying the upstream bing search engine with user provided query and with a page
+//! number if provided.
+
+use std::collections::HashMap;
+
+use regex::Regex;
+use reqwest::header::HeaderMap;
+use reqwest::Client;
+use scraper::Html;
+
+use crate::models::aggregation_models::SearchResult;
+
+use crate::models::engine_models::{EngineError, SearchEngine};
+
+use error_stack::{Report, Result, ResultExt};
+
+use super::search_result_parser::SearchResultParser;
+
+/// A new Bing engine type defined in-order to implement the `SearchEngine` trait which allows to
+/// reduce code duplication as well as allows to create vector of different search engines easily.
+pub struct Bing {
+    /// The parser, used to interpret the search result.
+    parser: SearchResultParser,
+}
+
+impl Bing {
+    /// Creates the Bing parser.
+    pub fn new() -> Result<Self, EngineError> {
+        Ok(Self {
+            parser: SearchResultParser::new(
+                ".b_results",
+                ".b_algo",
+                "h2 a",
+                ".tpcn a.tilk",
+                ".b_caption p",
+            )?,
+        })
+    }
+}
+
+#[async_trait::async_trait]
+impl SearchEngine for Bing {
+    async fn results(
+        &self,
+        query: &str,
+        page: u32,
+        user_agent: &str,
+        client: &Client,
+        _safe_search: u8,
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
+        // Bing uses `start results from this number` convention
+        // So, for 10 results per page, page 0 starts at 1, page 1
+        // starts at 11, and so on.
+        let results_per_page = 10;
+        let start_result = results_per_page * page + 1;
+
+        let url: String = match page {
+            0 => {
+                format!("https://www.bing.com/search?q={query}")
+            }
+            _ => {
+                format!("https://www.bing.com/search?q={query}&first={start_result}")
+            }
+        };
+
+        let query_params: Vec<(&str, &str)> = vec![
+            ("_EDGE_V", "1"),
+            ("SRCHD=AF", "NOFORM"),
+            ("_Rwho=u", "d"),
+            ("bngps=s", "0"),
+            ("_UR=QS=0&TQS", "0"),
+            ("_UR=QS=0&TQS", "0"),
+        ];
+
+        let mut cookie_string = String::new();
+        for (k, v) in &query_params {
+            cookie_string.push_str(&format!("{k}={v}; "));
+        }
+
+        let header_map = HeaderMap::try_from(&HashMap::from([
+            ("USER_AGENT".to_string(), user_agent.to_string()),
+            ("REFERER".to_string(), "https://google.com/".to_string()),
+            (
+                "CONTENT_TYPE".to_string(),
+                "application/x-www-form-urlencoded".to_string(),
+            ),
+            ("COOKIE".to_string(), cookie_string),
+        ]))
+        .change_context(EngineError::UnexpectedError)?;
+
+        let document: Html = Html::parse_document(
+            &Bing::fetch_html_from_upstream(self, &url, header_map, client).await?,
+        );
+
+        // Bing is very aggressive in finding matches
+        // even with the most absurd of queries. ".b_algo" is the
+        // class for the list item of results
+        if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
+            if no_result_msg
+                .value()
+                .attr("class")
+                .map(|classes| classes.contains("b_algo"))
+                .unwrap_or(false)
+            {
+                return Err(Report::new(EngineError::EmptyResultSet));
+            }
+        }
+
+        let re_span = Regex::new(r#"<span.*?>.*?(?:</span>&nbsp;·|</span>)"#).unwrap();
+        let re_strong = Regex::new(r#"(<strong>|</strong>)"#).unwrap();
+
+        // scrape all the results from the html
+        self.parser
+            .parse_for_results(&document, |title, url, desc| {
+                Some(SearchResult::new(
+                    &re_strong.replace_all(title.inner_html().trim(), ""),
+                    url.value().attr("href").unwrap(),
+                    &re_span.replace_all(desc.inner_html().trim(), ""),
+                    &["bing"],
+                ))
+            })
+    }
+}
diff --git a/src/engines/mod.rs b/src/engines/mod.rs
index d56ec6f..a93c9c2 100644
--- a/src/engines/mod.rs
+++ b/src/engines/mod.rs
@@ -3,6 +3,7 @@
 //! provide a standard functions to be implemented for all the upstream search engine handling
 //! code. Moreover, it also provides a custom error for the upstream search engine handling code.
 
+pub mod bing;
 pub mod brave;
 pub mod duckduckgo;
 pub mod librex;
diff --git a/src/lib.rs b/src/lib.rs
index 0d8f49d..ec35273 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,7 +21,12 @@ use crate::server::router;
 use actix_cors::Cors;
 use actix_files as fs;
 use actix_governor::{Governor, GovernorConfigBuilder};
-use actix_web::{dev::Server, http::header, middleware::Logger, web, App, HttpServer};
+use actix_web::{
+    dev::Server,
+    http::header,
+    middleware::{Compress, Logger},
+    web, App, HttpServer,
+};
 use cache::cacher::{Cacher, SharedCache};
 use config::parser::Config;
 use handler::{file_path, FileType};
@@ -73,6 +78,8 @@
             ]);
 
         App::new()
+            // Compress the responses provided by the server for the client requests.
+            .wrap(Compress::default())
             .wrap(Logger::default()) // added logging middleware for logging.
             .app_data(web::Data::new(config.clone()))
             .app_data(cache.clone())
diff --git a/src/models/engine_models.rs b/src/models/engine_models.rs
index 2a698d5..3108e6e 100644
--- a/src/models/engine_models.rs
+++ b/src/models/engine_models.rs
@@ -166,6 +166,10 @@ impl EngineHandler {
                 let engine = crate::engines::mojeek::Mojeek::new()?;
                 ("mojeek", Box::new(engine))
             }
+            "bing" => {
+                let engine = crate::engines::bing::Bing::new()?;
+                ("bing", Box::new(engine))
+            }
             _ => {
                 return Err(Report::from(EngineError::NoSuchEngineFound(
                     engine_name.to_string(),
diff --git a/src/server/router.rs b/src/server/router.rs
index 74b8bfa..c46e79d 100644
--- a/src/server/router.rs
+++ b/src/server/router.rs
@@ -6,22 +6,20 @@ use crate::{
     config::parser::Config,
     handler::{file_path, FileType},
 };
-use actix_web::{get, web, HttpRequest, HttpResponse};
+use actix_web::{get, http::header::ContentType, web, HttpRequest, HttpResponse};
 use std::fs::read_to_string;
 
 /// Handles the route of index page or main page of the `websurfx` meta search engine website.
 #[get("/")]
 pub async fn index(config: web::Data<Config>) -> Result<HttpResponse, Box<dyn std::error::Error>> {
-    Ok(HttpResponse::Ok()
-        .content_type("text/html; charset=utf-8")
-        .body(
-            crate::templates::views::index::index(
-                &config.style.colorscheme,
-                &config.style.theme,
-                &config.style.animation,
-            )
-            .0,
-        ))
+    Ok(HttpResponse::Ok().content_type(ContentType::html()).body(
+        crate::templates::views::index::index(
+            &config.style.colorscheme,
+            &config.style.theme,
+            &config.style.animation,
+        )
+        .0,
+    ))
 }
 
 /// Handles the route of any other accessed route/page which is not provided by the
@@ -29,16 +27,14 @@ pub async fn index(config: web::Data<Config>) -> Result<HttpResponse, Box<dyn
 pub async fn not_found(
     config: web::Data<Config>,
 ) -> Result<HttpResponse, Box<dyn std::error::Error>> {
-    Ok(HttpResponse::Ok()
-        .content_type("text/html; charset=utf-8")
-        .body(
-            crate::templates::views::not_found::not_found(
-                &config.style.colorscheme,
-                &config.style.theme,
-                &config.style.animation,
-            )
-            .0,
-        ))
+    Ok(HttpResponse::Ok().content_type(ContentType::html()).body(
+        crate::templates::views::not_found::not_found(
+            &config.style.colorscheme,
+            &config.style.theme,
+            &config.style.animation,
+        )
+        .0,
+    ))
 }
 
 /// Handles the route of robots.txt page of the `websurfx` meta search engine website.
@@ -47,23 +43,21 @@ pub async fn robots_data(_req: HttpRequest) -> Result<HttpResponse, Box<dyn
     let page_content: String =
         read_to_string(format!("{}/robots.txt", file_path(FileType::Config)?))?;
     Ok(HttpResponse::Ok()
-        .content_type("text/plain; charset=ascii")
+        .content_type(ContentType::plaintext())
         .body(page_content))
 }
 
 /// Handles the route of about page of the `websurfx` meta search engine website.
 #[get("/about")]
 pub async fn about(config: web::Data<Config>) -> Result<HttpResponse, Box<dyn std::error::Error>> {
-    Ok(HttpResponse::Ok()
-        .content_type("text/html; charset=utf-8")
-        .body(
-            crate::templates::views::about::about(
-                &config.style.colorscheme,
-                &config.style.theme,
-                &config.style.animation,
-            )
-            .0,
-        ))
+    Ok(HttpResponse::Ok().content_type(ContentType::html()).body(
+        crate::templates::views::about::about(
+            &config.style.colorscheme,
+            &config.style.theme,
+            &config.style.animation,
+        )
+        .0,
+    ))
 }
 
 /// Handles the route of settings page of the `websurfx` meta search engine website.
@@ -71,16 +65,14 @@ pub async fn about(config: web::Data<Config>) -> Result<HttpResponse, Box<dyn
 pub async fn settings(
     config: web::Data<Config>,
 ) -> Result<HttpResponse, Box<dyn std::error::Error>> {
-    Ok(HttpResponse::Ok()
-        .content_type("text/html; charset=utf-8")
-        .body(
-            crate::templates::views::settings::settings(
-                config.safe_search,
-                &config.style.colorscheme,
-                &config.style.theme,
-                &config.style.animation,
-                &config.upstream_search_engines,
-            )?
-            .0,
-        ))
+    Ok(HttpResponse::Ok().content_type(ContentType::html()).body(
+        crate::templates::views::settings::settings(
+            config.safe_search,
+            &config.style.colorscheme,
+            &config.style.theme,
+            &config.style.animation,
+            &config.upstream_search_engines,
+        )?
+        .0,
+    ))
 }
diff --git a/src/server/routes/search.rs b/src/server/routes/search.rs
index 35bd9ad..908875a 100644
--- a/src/server/routes/search.rs
+++ b/src/server/routes/search.rs
@@ -11,7 +11,7 @@ use crate::{
     },
     results::aggregator::aggregate,
 };
-use actix_web::{get, web, HttpRequest, HttpResponse};
+use actix_web::{get, http::header::ContentType, web, HttpRequest, HttpResponse};
 use regex::Regex;
 use std::{
     fs::File,
@@ -68,18 +68,16 @@ pub async fn search(
                 get_results(page + 1)
             );
 
-            Ok(HttpResponse::Ok()
-                .content_type("text/html; charset=utf-8")
-                .body(
-                    crate::templates::views::search::search(
-                        &config.style.colorscheme,
-                        &config.style.theme,
-                        &config.style.animation,
-                        query,
-                        &results?,
-                    )
-                    .0,
-                ))
+            Ok(HttpResponse::Ok().content_type(ContentType::html()).body(
+                crate::templates::views::search::search(
+                    &config.style.colorscheme,
+                    &config.style.theme,
+                    &config.style.animation,
+                    query,
+                    &results?,
+                )
+                .0,
+            ))
         }
         None => Ok(HttpResponse::TemporaryRedirect()
             .insert_header(("location", "/"))
diff --git a/websurfx/config.lua b/websurfx/config.lua
index ae7a6bb..ce2d609 100644
--- a/websurfx/config.lua
+++ b/websurfx/config.lua
@@ -65,4 +65,5 @@ upstream_search_engines = {
     Startpage = false,
     LibreX = false,
     Mojeek = false,
+    Bing = false,
 } -- select the upstream search engines from which the results should be fetched.
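Two of the changes above are user-facing. First, the new Bing engine is registered but ships disabled: to actually query it, flip `Bing = false` to `Bing = true` in the `upstream_search_engines` table of `websurfx/config.lua`. Second, responses are now compressed via actix-web's `Compress` middleware, with the `compress-brotli` feature enabled in `Cargo.toml`. The snippet below is a minimal, self-contained sketch of that middleware wiring only; the route, port, and response body are made up for illustration and are not part of this PR:

```rust
use actix_web::{middleware::Compress, web, App, HttpResponse, HttpServer};

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    HttpServer::new(|| {
        App::new()
            // Compress negotiates an encoding from the client's Accept-Encoding header;
            // brotli support comes from the "compress-brotli" crate feature that this
            // PR adds to the actix-web dependency.
            .wrap(Compress::default())
            .route("/", web::get().to(|| async { HttpResponse::Ok().body("hello") }))
    })
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}
```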