initial commit
This commit is contained in: commit 15fc415301
30 changed files with 4359 additions and 0 deletions
src/bin/websurfx.rs (Normal file, 35 lines)
@@ -0,0 +1,35 @@
use websurfx::server::routes;

use actix_files as fs;
use actix_web::{web, App, HttpServer};
use handlebars::Handlebars;

// The function that launches the main server and handles routing functionality
#[actix_web::main]
async fn main() -> std::io::Result<()> {
    let mut handlebars: Handlebars = Handlebars::new();

    handlebars
        .register_templates_directory(".html", "./public/templates")
        .unwrap();

    let handlebars_ref: web::Data<Handlebars> = web::Data::new(handlebars);

    HttpServer::new(move || {
        App::new()
            .app_data(handlebars_ref.clone())
            // Serve images and static files (css and js files).
            .service(fs::Files::new("/static", "./public/static").show_files_listing())
            .service(fs::Files::new("/images", "./public/images").show_files_listing())
            .service(routes::robots_data) // robots.txt
            .service(routes::index) // index page
            .service(routes::search) // search page
            .service(routes::about) // about page
            .service(routes::settings) // settings page
            .default_service(web::route().to(routes::not_found)) // error page
    })
    // Start server on 127.0.0.1:8080
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}
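Illustrative only (not part of this commit): the route wiring above can be exercised with actix-web's built-in test utilities. The sketch below assumes actix-web 4's test module and that ./public/robots.txt exists at the crate root; it hits the robots.txt route, which needs no handlebars state.

use actix_web::{test, App};
use websurfx::server::routes;

// Hypothetical integration test, not present in this commit.
#[actix_web::test]
async fn robots_route_responds() {
    // Build a minimal App with only the robots.txt service registered.
    let app = test::init_service(App::new().service(routes::robots_data)).await;
    let req = test::TestRequest::get().uri("/robots.txt").to_request();
    let resp = test::call_service(&app, req).await;
    // Expect a 200 as long as ./public/robots.txt is readable from the crate root.
    assert!(resp.status().is_success());
}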
src/engines/duckduckgo.rs (Normal file, 96 lines)
@@ -0,0 +1,96 @@
use std::collections::HashMap;

use reqwest::header::USER_AGENT;
use scraper::{Html, Selector};

use crate::search_results_handler::aggregation_models::RawSearchResult;

// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
// results like title, visiting_url (href in html), engine (from which engine it was fetched)
// and description into a RawSearchResult, then adds that to a HashMap whose keys are urls and
// values are RawSearchResult structs, and finally returns it within a Result enum.
pub async fn results(
    query: &str,
    page: Option<u32>,
    user_agent: &str,
) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
    // Page number can be missing or an empty string, so appropriate handling is required
    // so that the upstream server receives a valid page number.
    let url: String = match page {
        Some(page_number) => {
            if page_number <= 1 {
                format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
            } else {
                format!(
                    "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
                    query,
                    page_number / 2 * 30,
                    page_number / 2 * 30 + 1
                )
            }
        }
        None => format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js"),
    };

    // fetch the html from upstream duckduckgo engine
    // TODO: Write better error handling code to handle no results case.
    let results: String = reqwest::Client::new()
        .get(url)
        .header(USER_AGENT, user_agent)
        .send()
        .await?
        .text()
        .await?;

    let document: Html = Html::parse_document(&results);
    let results: Selector = Selector::parse(".result")?;
    let result_title: Selector = Selector::parse(".result__a")?;
    let result_url: Selector = Selector::parse(".result__url")?;
    let result_desc: Selector = Selector::parse(".result__snippet")?;

    let mut search_results: HashMap<String, RawSearchResult> = HashMap::new();

    // scrape all the results from the html
    for result in document.select(&results) {
        let search_result: RawSearchResult = RawSearchResult {
            title: result
                .select(&result_title)
                .next()
                .unwrap()
                .inner_html()
                .trim()
                .to_string(),
            visiting_url: format!(
                "https://{}",
                result
                    .select(&result_url)
                    .next()
                    .unwrap()
                    .inner_html()
                    .trim()
            ),
            description: result
                .select(&result_desc)
                .next()
                .unwrap()
                .inner_html()
                .trim()
                .to_string(),
            engine: vec!["duckduckgo".to_string()],
        };
        search_results.insert(
            format!(
                "https://{}",
                result
                    .select(&result_url)
                    .next()
                    .unwrap()
                    .inner_html()
                    .trim()
            ),
            search_result,
        );
    }

    Ok(search_results)
}
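For reference, a quick sketch (not part of the commit) of how the integer arithmetic in the Some(page_number) branch above maps page numbers onto DuckDuckGo's s and dc offsets; note that integer division makes pages 2 and 3 share the same offset.

// Hypothetical helper mirroring the offset expression used in results() above.
fn offsets(page_number: u32) -> (u32, u32) {
    (page_number / 2 * 30, page_number / 2 * 30 + 1)
}

fn main() {
    assert_eq!(offsets(2), (30, 31)); // 2 / 2 * 30 = 30
    assert_eq!(offsets(3), (30, 31)); // 3 / 2 = 1 (integer division), so still 30
    assert_eq!(offsets(4), (60, 61)); // 4 / 2 * 30 = 60
}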
src/engines/mod.rs (Normal file, 2 lines)
@@ -0,0 +1,2 @@
pub mod duckduckgo;
pub mod searx;
src/engines/searx.rs (Normal file, 89 lines)
@@ -0,0 +1,89 @@
use std::collections::HashMap;

use reqwest::header::USER_AGENT;
use scraper::{Html, Selector};

use crate::search_results_handler::aggregation_models::RawSearchResult;

// This function scrapes results from the upstream searx instance and puts all the scraped
// results like title, visiting_url (href in html), engine (from which engine it was fetched)
// and description into a RawSearchResult, then adds that to a HashMap whose keys are urls and
// values are RawSearchResult structs, and finally returns it within a Result enum.
pub async fn results(
    query: &str,
    page: Option<u32>,
    user_agent: &str,
) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
    // Page number can be missing or an empty string, so appropriate handling is required
    // so that the upstream server receives a valid page number.
    let url: String = match page {
        Some(page_number) => {
            if page_number <= 1 {
                format!("https://searx.work/search?q={query}")
            } else {
                format!("https://searx.work/search?q={query}&pageno={page_number}")
            }
        }
        None => format!("https://searx.work/search?q={query}"),
    };

    // fetch the html from the upstream searx instance
    // TODO: Write better error handling code to handle no results case.
    let results: String = reqwest::Client::new()
        .get(url)
        .header(USER_AGENT, user_agent)
        .send()
        .await?
        .text()
        .await?;

    let document: Html = Html::parse_document(&results);
    let results: Selector = Selector::parse(".result")?;
    let result_title: Selector = Selector::parse("h3>a")?;
    let result_url: Selector = Selector::parse("h3>a")?;
    let result_desc: Selector = Selector::parse(".content")?;

    let mut search_results: HashMap<String, RawSearchResult> = HashMap::new();

    // scrape all the results from the html
    for result in document.select(&results) {
        let search_result: RawSearchResult = RawSearchResult {
            title: result
                .select(&result_title)
                .next()
                .unwrap()
                .inner_html()
                .trim()
                .to_string(),
            visiting_url: result
                .select(&result_url)
                .next()
                .unwrap()
                .value()
                .attr("href")
                .unwrap()
                .to_string(),
            description: result
                .select(&result_desc)
                .next()
                .unwrap()
                .inner_html()
                .trim()
                .to_string(),
            engine: vec!["searx".to_string()],
        };
        search_results.insert(
            result
                .select(&result_url)
                .next()
                .unwrap()
                .value()
                .attr("href")
                .unwrap()
                .to_string(),
            search_result,
        );
    }

    Ok(search_results)
}
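A minimal usage sketch, assuming the crate layout above; it calls the searx scraper directly and prints the URL and title of each scraped result. The query and user agent values are placeholders, not anything this commit prescribes.

use websurfx::engines::searx;

// Hypothetical driver, not part of this commit.
#[actix_web::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let results = searx::results("rust web framework", Some(1), "Mozilla/5.0").await?;
    for (url, result) in &results {
        // Keys are the result URLs; values carry title, description and engine names.
        println!("{url} -> {}", result.title);
    }
    Ok(())
}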
src/lib.rs (Normal file, 3 lines)
@@ -0,0 +1,3 @@
pub mod engines;
pub mod server;
pub mod search_results_handler;
src/search_results_handler/aggregation_models.rs (Normal file, 25 lines)
@@ -0,0 +1,25 @@
use serde::Serialize;

#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
    pub title: String,
    pub visiting_url: String,
    pub url: String,
    pub description: String,
    pub engine: Vec<String>,
}

pub struct RawSearchResult {
    pub title: String,
    pub visiting_url: String,
    pub description: String,
    pub engine: Vec<String>,
}

#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResults {
    pub results: Vec<SearchResult>,
    pub page_query: String,
}
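A small sketch of how the rename_all = "camelCase" attributes above affect serialization; it assumes serde_json is available, which this commit does not declare, so treat it purely as an illustration. Fields such as visiting_url and page_query come out as visitingUrl and pageQuery.

use websurfx::search_results_handler::aggregation_models::{SearchResult, SearchResults};

// Hypothetical example, not part of this commit; requires serde_json.
fn main() {
    let results = SearchResults {
        results: vec![SearchResult {
            title: "Example".to_string(),
            visiting_url: "https://example.com".to_string(),
            url: "https://example.com".to_string(),
            description: "An example result".to_string(),
            engine: vec!["duckduckgo".to_string()],
        }],
        page_query: "example".to_string(),
    };
    // Prints keys as visitingUrl and pageQuery because of the camelCase renaming.
    println!("{}", serde_json::to_string_pretty(&results).unwrap());
}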
src/search_results_handler/aggregator.rs (Normal file, 77 lines)
@@ -0,0 +1,77 @@
use std::collections::HashMap;

use fake_useragent::{Browsers, UserAgentsBuilder};

use super::aggregation_models::{RawSearchResult, SearchResult, SearchResults};
use crate::engines::{duckduckgo, searx};

// A function that aggregates all the scraped results from the above upstream engines and
// then removes duplicate results. If a result is found to come from two or more engines,
// their names are put together to show which upstream engines the result was fetched from.
// It then moves all data out of the HashMap into a struct holding all results aggregated
// into a vector, and also adds the query used into the struct; this is necessary because
// otherwise the search bar on the search page remains empty if searched from the query url.
//
// For Example:
//
// If you search from the url like *https://127.0.0.1/search?q=huston* then the search bar should
// contain the word huston and not remain empty.
pub async fn aggregate(
    query: &str,
    page: Option<u32>,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
    // Generate random user agent to improve privacy of the user.
    let user_agent: String = UserAgentsBuilder::new()
        .cache(false)
        .dir("/tmp")
        .thread(1)
        .set_browsers(
            Browsers::new()
                .set_chrome()
                .set_safari()
                .set_edge()
                .set_firefox()
                .set_mozilla(),
        )
        .build()
        .random()
        .to_string();

    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();

    let ddg_map_results: HashMap<String, RawSearchResult> =
        duckduckgo::results(query, page, &user_agent).await?;
    let searx_map_results: HashMap<String, RawSearchResult> =
        searx::results(query, page, &user_agent).await?;

    result_map.extend(ddg_map_results);

    for (key, value) in searx_map_results.into_iter() {
        if result_map.contains_key(&key) {
            result_map
                .get_mut(&key)
                .unwrap()
                .engine
                .push(value.engine.get(0).unwrap().to_string())
        } else {
            result_map.insert(key, value);
        }
    }

    let mut search_results: Vec<SearchResult> = Vec::new();

    for (key, value) in result_map.into_iter() {
        search_results.push(SearchResult {
            title: value.title,
            visiting_url: value.visiting_url,
            url: key,
            description: value.description,
            engine: value.engine,
        })
    }

    Ok(SearchResults {
        results: search_results,
        page_query: query.to_string(),
    })
}
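A toy illustration (hypothetical values, not part of the commit) of the deduplication step above: when both engines return the same URL, the existing entry simply gains the second engine's name instead of being inserted twice.

use std::collections::HashMap;
use websurfx::search_results_handler::aggregation_models::RawSearchResult;

// Hypothetical demonstration of the merge behaviour in aggregate().
fn main() {
    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
    result_map.insert(
        "https://example.com".to_string(),
        RawSearchResult {
            title: "Example".to_string(),
            visiting_url: "https://example.com".to_string(),
            description: "From duckduckgo".to_string(),
            engine: vec!["duckduckgo".to_string()],
        },
    );

    // A searx result for the same URL only extends the engine list.
    let key = "https://example.com".to_string();
    if result_map.contains_key(&key) {
        result_map.get_mut(&key).unwrap().engine.push("searx".to_string());
    }

    assert_eq!(
        result_map[&key].engine,
        vec!["duckduckgo".to_string(), "searx".to_string()]
    );
}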
src/search_results_handler/mod.rs (Normal file, 2 lines)
@@ -0,0 +1,2 @@
pub mod aggregation_models;
pub mod aggregator;
src/server/mod.rs (Normal file, 1 line)
@@ -0,0 +1 @@
pub mod routes;
src/server/routes.rs (Normal file, 79 lines)
@@ -0,0 +1,79 @@
use std::fs::read_to_string;

use crate::search_results_handler::aggregator::aggregate;
use actix_web::{get, web, HttpRequest, HttpResponse};
use handlebars::Handlebars;
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct SearchParams {
    q: Option<String>,
    page: Option<u32>,
}

#[get("/")]
pub async fn index(
    hbs: web::Data<Handlebars<'_>>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let page_content: String = hbs.render("index", &"").unwrap();
    Ok(HttpResponse::Ok().body(page_content))
}

pub async fn not_found(
    hbs: web::Data<Handlebars<'_>>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let page_content: String = hbs.render("404", &"")?;

    Ok(HttpResponse::Ok()
        .content_type("text/html; charset=utf-8")
        .body(page_content))
}

#[get("/search")]
pub async fn search(
    hbs: web::Data<Handlebars<'_>>,
    req: HttpRequest,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let params = web::Query::<SearchParams>::from_query(req.query_string())?;
    match &params.q {
        Some(query) => {
            if query.trim().is_empty() {
                Ok(HttpResponse::Found()
                    .insert_header(("location", "/"))
                    .finish())
            } else {
                let results_json: crate::search_results_handler::aggregation_models::SearchResults =
                    aggregate(query, params.page).await?;
                let page_content: String = hbs.render("search", &results_json)?;
                Ok(HttpResponse::Ok().body(page_content))
            }
        }
        None => Ok(HttpResponse::Found()
            .insert_header(("location", "/"))
            .finish()),
    }
}

#[get("/robots.txt")]
pub async fn robots_data(_req: HttpRequest) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let page_content: String = read_to_string("./public/robots.txt")?;
    Ok(HttpResponse::Ok()
        .content_type("text/plain; charset=ascii")
        .body(page_content))
}

#[get("/about")]
pub async fn about(
    hbs: web::Data<Handlebars<'_>>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let page_content: String = hbs.render("about", &"")?;
    Ok(HttpResponse::Ok().body(page_content))
}

#[get("/settings")]
pub async fn settings(
    hbs: web::Data<Handlebars<'_>>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    let page_content: String = hbs.render("settings", &"")?;
    Ok(HttpResponse::Ok().body(page_content))
}
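Illustrative only: a hedged sketch of how the blank-query branch of the search handler above could be checked with actix-web's test helpers. It assumes the ./public/templates directory from this commit exists so that handlebars registration succeeds, even though the redirect path never renders a template.

use actix_web::{http::StatusCode, test, web, App};
use handlebars::Handlebars;
use websurfx::server::routes;

// Hypothetical test, not part of this commit.
#[actix_web::test]
async fn blank_query_redirects_home() {
    let mut handlebars: Handlebars = Handlebars::new();
    handlebars
        .register_templates_directory(".html", "./public/templates")
        .unwrap();
    let handlebars_ref: web::Data<Handlebars> = web::Data::new(handlebars);

    let app = test::init_service(
        App::new()
            .app_data(handlebars_ref.clone())
            .service(routes::search),
    )
    .await;

    // "%20" decodes to a single space, which query.trim().is_empty() treats as blank.
    let req = test::TestRequest::get().uri("/search?q=%20").to_request();
    let resp = test::call_service(&app, req).await;
    assert_eq!(resp.status(), StatusCode::FOUND);
}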