initial commit

neon_arch 2023-04-22 14:35:07 +03:00
commit 15fc415301
30 changed files with 4359 additions and 0 deletions

src/bin/websurfx.rs (new file, 35 lines)

@@ -0,0 +1,35 @@
use websurfx::server::routes;

use actix_files as fs;
use actix_web::{web, App, HttpServer};
use handlebars::Handlebars;

// The function that launches the main server and handles the routing functionality.
#[actix_web::main]
async fn main() -> std::io::Result<()> {
    let mut handlebars: Handlebars = Handlebars::new();

    handlebars
        .register_templates_directory(".html", "./public/templates")
        .unwrap();

    let handlebars_ref: web::Data<Handlebars> = web::Data::new(handlebars);

    HttpServer::new(move || {
        App::new()
            .app_data(handlebars_ref.clone())
            // Serve images and static files (css and js files).
            .service(fs::Files::new("/static", "./public/static").show_files_listing())
            .service(fs::Files::new("/images", "./public/images").show_files_listing())
            .service(routes::robots_data) // robots.txt
            .service(routes::index) // index page
            .service(routes::search) // search page
            .service(routes::about) // about page
            .service(routes::settings) // settings page
            .default_service(web::route().to(routes::not_found)) // error page
    })
    // Start the server on 127.0.0.1:8080.
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}
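
A quick way to smoke-test these routes once the server is running — a minimal sketch, not part of the commit; it assumes reqwest (already a dependency of the engine modules) with its optional "blocking" feature enabled:

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Assumes the websurfx server above is already listening on 127.0.0.1:8080.
    let index = reqwest::blocking::get("http://127.0.0.1:8080/")?;
    println!("GET / -> {}", index.status());

    let search = reqwest::blocking::get("http://127.0.0.1:8080/search?q=huston&page=1")?;
    println!("GET /search -> {}", search.status());

    Ok(())
}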

src/engines/duckduckgo.rs (new file, 96 lines)

@@ -0,0 +1,96 @@
use std::collections::HashMap;

use reqwest::header::USER_AGENT;
use scraper::{Html, Selector};

use crate::search_results_handler::aggregation_models::RawSearchResult;

// This function scrapes results from the upstream engine duckduckgo. For every scraped result
// it collects the title, the visiting_url (the href in the html), the engine it was fetched
// from and the description into a RawSearchResult, inserts that into a HashMap keyed by url,
// and returns the map wrapped in a Result enum.
pub async fn results(
    query: &str,
    page: Option<u32>,
    user_agent: &str,
) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
    // The page number can be missing or an empty string, so appropriate handling is required
    // so that the upstream server receives a valid page number.
    let url: String = match page {
        Some(page_number) => {
            if page_number <= 1 {
                format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
            } else {
                // Pages beyond the first are requested via duckduckgo's offset-style
                // `s` and `dc` parameters.
                format!(
                    "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
                    query,
                    page_number / 2 * 30,
                    page_number / 2 * 30 + 1
                )
            }
        }
        None => format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js"),
    };

    // Fetch the html from the upstream duckduckgo engine.
    // TODO: Write better error handling code to handle the no-results case.
    let results: String = reqwest::Client::new()
        .get(url)
        .header(USER_AGENT, user_agent)
        .send()
        .await?
        .text()
        .await?;

    let document: Html = Html::parse_document(&results);
    let results: Selector = Selector::parse(".result")?;
    let result_title: Selector = Selector::parse(".result__a")?;
    let result_url: Selector = Selector::parse(".result__url")?;
    let result_desc: Selector = Selector::parse(".result__snippet")?;

    let mut search_results: HashMap<String, RawSearchResult> = HashMap::new();

    // Scrape all the results from the html.
    for result in document.select(&results) {
        let search_result: RawSearchResult = RawSearchResult {
            title: result
                .select(&result_title)
                .next()
                .unwrap()
                .inner_html()
                .trim()
                .to_string(),
            visiting_url: format!(
                "https://{}",
                result
                    .select(&result_url)
                    .next()
                    .unwrap()
                    .inner_html()
                    .trim()
            ),
            description: result
                .select(&result_desc)
                .next()
                .unwrap()
                .inner_html()
                .trim()
                .to_string(),
            engine: vec!["duckduckgo".to_string()],
        };
        search_results.insert(
            format!(
                "https://{}",
                result
                    .select(&result_url)
                    .next()
                    .unwrap()
                    .inner_html()
                    .trim()
            ),
            search_result,
        );
    }

    Ok(search_results)
}
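
A short usage sketch for the function above, not part of the commit; the module is async, so it reuses the actix runtime already used by main. The query and user-agent strings are illustrative only.

use websurfx::engines::duckduckgo;

#[actix_web::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Illustrative query and user agent; any browser-like agent string will do.
    let results = duckduckgo::results("huston", Some(1), "Mozilla/5.0").await?;
    for (url, result) in results.iter().take(3) {
        println!("{} -> {}", result.title, url);
    }
    Ok(())
}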

src/engines/mod.rs (new file, 2 lines)

@@ -0,0 +1,2 @@
pub mod duckduckgo;
pub mod searx;

src/engines/searx.rs (new file, 89 lines)

@@ -0,0 +1,89 @@
use std::collections::HashMap;

use reqwest::header::USER_AGENT;
use scraper::{Html, Selector};

use crate::search_results_handler::aggregation_models::RawSearchResult;

// This function scrapes results from an upstream searx instance. For every scraped result
// it collects the title, the visiting_url (the href in the html), the engine it was fetched
// from and the description into a RawSearchResult, inserts that into a HashMap keyed by url,
// and returns the map wrapped in a Result enum.
pub async fn results(
    query: &str,
    page: Option<u32>,
    user_agent: &str,
) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
    // The page number can be missing or an empty string, so appropriate handling is required
    // so that the upstream server receives a valid page number.
    let url: String = match page {
        Some(page_number) => {
            if page_number <= 1 {
                format!("https://searx.work/search?q={query}")
            } else {
                format!("https://searx.work/search?q={query}&pageno={page_number}")
            }
        }
        None => format!("https://searx.work/search?q={query}"),
    };

    // Fetch the html from the upstream searx instance.
    // TODO: Write better error handling code to handle the no-results case.
    let results: String = reqwest::Client::new()
        .get(url)
        .header(USER_AGENT, user_agent)
        .send()
        .await?
        .text()
        .await?;

    let document: Html = Html::parse_document(&results);
    let results: Selector = Selector::parse(".result")?;
    let result_title: Selector = Selector::parse("h3>a")?;
    let result_url: Selector = Selector::parse("h3>a")?;
    let result_desc: Selector = Selector::parse(".content")?;

    let mut search_results: HashMap<String, RawSearchResult> = HashMap::new();

    // Scrape all the results from the html.
    for result in document.select(&results) {
        let search_result: RawSearchResult = RawSearchResult {
            title: result
                .select(&result_title)
                .next()
                .unwrap()
                .inner_html()
                .trim()
                .to_string(),
            visiting_url: result
                .select(&result_url)
                .next()
                .unwrap()
                .value()
                .attr("href")
                .unwrap()
                .to_string(),
            description: result
                .select(&result_desc)
                .next()
                .unwrap()
                .inner_html()
                .trim()
                .to_string(),
            engine: vec!["searx".to_string()],
        };
        search_results.insert(
            result
                .select(&result_url)
                .next()
                .unwrap()
                .value()
                .attr("href")
                .unwrap()
                .to_string(),
            search_result,
        );
    }

    Ok(search_results)
}
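
Unlike duckduckgo.rs, the title and url here come from the same "h3>a" anchor: inner_html() yields the link text and attr("href") the target. A tiny self-contained sketch makes the distinction concrete (the fragment is a hand-written stand-in; real searx markup may differ):

use scraper::{Html, Selector};

fn main() {
    // A hand-written stand-in for a single searx result.
    let fragment = Html::parse_fragment(
        r#"<div class="result"><h3><a href="https://example.com">Example</a></h3></div>"#,
    );
    let link: Selector = Selector::parse("h3>a").unwrap();
    let anchor = fragment.select(&link).next().unwrap();

    // inner_html() gives the visible link text, attr("href") the url it points to.
    assert_eq!(anchor.inner_html(), "Example");
    assert_eq!(anchor.value().attr("href"), Some("https://example.com"));
}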

src/lib.rs (new file, 3 lines)

@@ -0,0 +1,3 @@
pub mod engines;
pub mod server;
pub mod search_results_handler;

src/search_results_handler/aggregation_models.rs (new file, 25 lines)

@@ -0,0 +1,25 @@
use serde::Serialize;

#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
    pub title: String,
    pub visiting_url: String,
    pub url: String,
    pub description: String,
    pub engine: Vec<String>,
}

pub struct RawSearchResult {
    pub title: String,
    pub visiting_url: String,
    pub description: String,
    pub engine: Vec<String>,
}

#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
    pub results: Vec<SearchResult>,
    pub page_query: String,
}
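
Since SearchResults is handed straight to hbs.render in the search route, the rename_all attribute means templates see camelCase keys (visitingUrl, pageQuery) rather than the Rust field names. A sketch of the serialized shape, assuming serde_json is pulled in purely for the demonstration:

use websurfx::search_results_handler::aggregation_models::SearchResult;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let result = SearchResult {
        title: "Example Domain".to_string(),
        visiting_url: "https://www.example.com".to_string(),
        url: "https://www.example.com".to_string(),
        description: "An illustrative result.".to_string(),
        engine: vec!["duckduckgo".to_string()],
    };
    // visiting_url serializes as "visitingUrl" because of rename_all = "camelCase".
    println!("{}", serde_json::to_string(&result)?);
    Ok(())
}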

src/search_results_handler/aggregator.rs (new file, 77 lines)

@@ -0,0 +1,77 @@
use std::collections::HashMap;

use fake_useragent::{Browsers, UserAgentsBuilder};

use super::aggregation_models::{RawSearchResult, SearchResult, SearchResults};
use crate::engines::{duckduckgo, searx};

// A function that aggregates all the scraped results from the upstream engines, removes
// duplicates, and, when the same result is returned by two or more engines, merges their
// names so the result shows which upstream engines it was fetched from. The HashMap is
// then drained into a struct that holds all the aggregated results in a vector, along
// with the query that was used. Carrying the query along is necessary because otherwise
// the search bar would remain empty when a search is made directly from the query url.
//
// For example:
//
// If you search from a url like *https://127.0.0.1/search?q=huston* then the search bar
// should contain the word huston and not remain empty.
pub async fn aggregate(
    query: &str,
    page: Option<u32>,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
    // Generate a random user agent to improve the privacy of the user.
    let user_agent: String = UserAgentsBuilder::new()
        .cache(false)
        .dir("/tmp")
        .thread(1)
        .set_browsers(
            Browsers::new()
                .set_chrome()
                .set_safari()
                .set_edge()
                .set_firefox()
                .set_mozilla(),
        )
        .build()
        .random()
        .to_string();

    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();

    let ddg_map_results: HashMap<String, RawSearchResult> =
        duckduckgo::results(query, page, &user_agent).await?;
    let searx_map_results: HashMap<String, RawSearchResult> =
        searx::results(query, page, &user_agent).await?;

    result_map.extend(ddg_map_results);

    // Merge the searx results in: on a duplicate url, only append the engine name.
    for (key, value) in searx_map_results.into_iter() {
        if result_map.contains_key(&key) {
            result_map
                .get_mut(&key)
                .unwrap()
                .engine
                .push(value.engine.get(0).unwrap().to_string())
        } else {
            result_map.insert(key, value);
        }
    }

    let mut search_results: Vec<SearchResult> = Vec::new();

    for (key, value) in result_map.into_iter() {
        search_results.push(SearchResult {
            title: value.title,
            visiting_url: value.visiting_url,
            url: key,
            description: value.description,
            engine: value.engine,
        })
    }

    Ok(SearchResults {
        results: search_results,
        page_query: query.to_string(),
    })
}
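
The merge loop is the piece that credits one result to several engines; isolated below with two single-entry maps standing in for the duckduckgo and searx outputs (a sketch, not part of the commit):

use std::collections::HashMap;

use websurfx::search_results_handler::aggregation_models::RawSearchResult;

fn main() {
    // Two single-entry maps sharing the same key, standing in for the two engines.
    let shared_url = "https://example.com".to_string();
    let make = |engine: &str| RawSearchResult {
        title: "Example".to_string(),
        visiting_url: shared_url.clone(),
        description: "An illustrative result.".to_string(),
        engine: vec![engine.to_string()],
    };

    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
    result_map.insert(shared_url.clone(), make("duckduckgo"));

    // The same merge step as in aggregate(): on a key collision only the
    // engine name is appended; the first result's fields win.
    for (key, value) in HashMap::from([(shared_url.clone(), make("searx"))]) {
        if let Some(existing) = result_map.get_mut(&key) {
            existing.engine.push(value.engine[0].clone());
        } else {
            result_map.insert(key, value);
        }
    }

    assert_eq!(result_map[&shared_url].engine, vec!["duckduckgo", "searx"]);
}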

src/search_results_handler/mod.rs (new file, 2 lines)

@@ -0,0 +1,2 @@
pub mod aggregation_models;
pub mod aggregator;

src/server/mod.rs (new file, 1 line)

@@ -0,0 +1 @@
pub mod routes;

src/server/routes.rs (new file, 79 lines)

@@ -0,0 +1,79 @@
use std::fs::read_to_string;

use actix_web::{get, web, HttpRequest, HttpResponse};
use handlebars::Handlebars;
use serde::Deserialize;

use crate::search_results_handler::aggregator::aggregate;

#[derive(Debug, Deserialize)]
struct SearchParams {
    q: Option<String>,
    page: Option<u32>,
}

// Handles the route of the index page.
#[get("/")]
pub async fn index(
    hbs: web::Data<Handlebars<'_>>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let page_content: String = hbs.render("index", &"").unwrap();
    Ok(HttpResponse::Ok().body(page_content))
}

// Handles any path that matches no other route, returning the 404 page.
pub async fn not_found(
    hbs: web::Data<Handlebars<'_>>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let page_content: String = hbs.render("404", &"")?;
    Ok(HttpResponse::Ok()
        .content_type("text/html; charset=utf-8")
        .body(page_content))
}

// Handles the route of the search page; redirects to the index page when the
// query is missing or empty.
#[get("/search")]
pub async fn search(
    hbs: web::Data<Handlebars<'_>>,
    req: HttpRequest,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let params = web::Query::<SearchParams>::from_query(req.query_string())?;
    match &params.q {
        Some(query) => {
            if query.trim().is_empty() {
                Ok(HttpResponse::Found()
                    .insert_header(("location", "/"))
                    .finish())
            } else {
                let results_json: crate::search_results_handler::aggregation_models::SearchResults =
                    aggregate(query, params.page).await?;
                let page_content: String = hbs.render("search", &results_json)?;
                Ok(HttpResponse::Ok().body(page_content))
            }
        }
        None => Ok(HttpResponse::Found()
            .insert_header(("location", "/"))
            .finish()),
    }
}

// Handles the route of the robots.txt file.
#[get("/robots.txt")]
pub async fn robots_data(_req: HttpRequest) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let page_content: String = read_to_string("./public/robots.txt")?;
    Ok(HttpResponse::Ok()
        .content_type("text/plain; charset=ascii")
        .body(page_content))
}

// Handles the route of the about page.
#[get("/about")]
pub async fn about(
    hbs: web::Data<Handlebars<'_>>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let page_content: String = hbs.render("about", &"")?;
    Ok(HttpResponse::Ok().body(page_content))
}

// Handles the route of the settings page.
#[get("/settings")]
pub async fn settings(
    hbs: web::Data<Handlebars<'_>>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let page_content: String = hbs.render("settings", &"")?;
    Ok(HttpResponse::Ok().body(page_content))
}
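
The SearchParams extraction operates on the raw query string, so it can be checked without starting the server. A minimal sketch, re-declaring the struct since it is private to this module:

use actix_web::web;
use serde::Deserialize;

// Mirrors the private SearchParams struct from the routes module.
#[derive(Debug, Deserialize)]
struct SearchParams {
    q: Option<String>,
    page: Option<u32>,
}

fn main() {
    // The same parsing step the /search handler performs on a request like
    // /search?q=huston&page=2.
    let params = web::Query::<SearchParams>::from_query("q=huston&page=2").unwrap();
    assert_eq!(params.q.as_deref(), Some("huston"));
    assert_eq!(params.page, Some(2));
}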