2023-09-03 20:50:50 +03:00
|
|
|
//! This module handles the search route of the search engine website.
|
2023-04-22 14:35:07 +03:00
|
|
|
|
2023-05-02 11:58:21 +03:00
|
|
|
use crate::{
|
2023-09-09 18:17:29 +02:00
|
|
|
cache::cacher::SharedCache,
|
2023-07-03 19:30:25 +02:00
|
|
|
config::parser::Config,
|
2023-11-27 07:45:28 +01:00
|
|
|
handler::{file_path, FileType},
|
2023-09-17 12:51:32 +03:00
|
|
|
models::{
|
|
|
|
aggregation_models::SearchResults,
|
2023-11-17 22:16:17 +03:00
|
|
|
engine_models::{EngineError, EngineHandler},
|
2023-09-17 12:51:32 +03:00
|
|
|
server_models::{Cookie, SearchParams},
|
|
|
|
},
|
2023-09-03 20:50:50 +03:00
|
|
|
results::aggregator::aggregate,
|
2023-05-02 11:58:21 +03:00
|
|
|
};
|
2023-04-22 14:35:07 +03:00
|
|
|
use actix_web::{get, web, HttpRequest, HttpResponse};
|
2023-09-02 17:50:06 +03:00
|
|
|
use regex::Regex;
|
2023-09-13 18:31:37 +03:00
|
|
|
use std::{
|
2023-09-17 12:51:32 +03:00
|
|
|
fs::File,
|
2023-09-13 18:31:37 +03:00
|
|
|
io::{BufRead, BufReader, Read},
|
|
|
|
};
|
2023-08-06 20:31:30 +03:00
|
|
|
use tokio::join;
|
2023-04-22 14:35:07 +03:00
|
|
|
|
2023-04-27 17:53:28 +03:00
|
|
|
/// Handles the route of search page of the `websurfx` meta search engine website and it takes
/// two search url parameters `q` and `page` where `page` parameter is optional.
///
/// # Example
///
/// ```bash
/// curl "http://127.0.0.1:8080/search?q=sweden&page=1"
/// ```
///
/// Or
///
/// ```bash
/// curl "http://127.0.0.1:8080/search?q=sweden"
/// ```
#[get("/search")]
pub async fn search(
    req: HttpRequest,
    config: web::Data<Config>,
    cache: web::Data<SharedCache>,
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
    // Parse `q`, `page` and `safesearch` out of the raw query string.
    let params = web::Query::<SearchParams>::from_query(req.query_string())?;
    match &params.q {
        Some(query) => {
            // A present-but-blank query is treated the same as no query:
            // bounce the user back to the index page.
            if query.trim().is_empty() {
                return Ok(HttpResponse::TemporaryRedirect()
                    .insert_header(("location", "/"))
                    .finish());
            }

            // Closure capturing everything but the page number, so the three
            // page fetches below differ only in `page`.
            let get_results = |page| {
                results(
                    &config,
                    &cache,
                    query,
                    page,
                    req.clone(),
                    &params.safesearch,
                )
            };

            // .max(1) makes sure that the page >= 0.
            // User-facing pages are 1-based; subtracting 1 converts to the
            // 0-based page index used internally.
            let page = params.page.unwrap_or(1).max(1) - 1;

            // Fetch the previous, current and next pages concurrently; only
            // the current page's results are rendered — the neighbours are
            // fetched to warm the cache for likely follow-up requests.
            // NOTE(review): when page == 0, saturating_sub(1) makes the first
            // and second fetches both request page 0 — presumably harmless
            // (second hits the cache), but confirm this is intended.
            let (_, results, _) = join!(
                get_results(page.saturating_sub(1)),
                get_results(page),
                get_results(page + 1)
            );

            // Render the search template; `.0` unwraps the template wrapper
            // into the raw HTML string used as the response body.
            Ok(HttpResponse::Ok()
                .content_type("text/html; charset=utf-8")
                .body(
                    crate::templates::views::search::search(
                        &config.style.colorscheme,
                        &config.style.theme,
                        &config.style.animation,
                        query,
                        &results?,
                    )
                    .0,
                ))
        }
        // No `q` parameter at all: redirect to the index page.
        None => Ok(HttpResponse::TemporaryRedirect()
            .insert_header(("location", "/"))
            .finish()),
    }
}
|
|
|
|
|
2023-09-03 19:23:34 +03:00
|
|
|
/// Fetches the results for a query and page. It First checks the redis cache, if that
/// fails it gets proper results by requesting from the upstream search engines.
///
/// # Arguments
///
/// * `config` - It takes a parsed config struct.
/// * `cache` - It takes the shared cache handle used to look up and store results.
/// * `query` - It takes the user's search query as a string slice.
/// * `page` - It takes the 0-based page number as a u32 value.
/// * `req` - It takes the `HttpRequest` struct as a value (used to read the user's cookie).
/// * `safe_search` - It takes the optional safe search level supplied via the url params.
///
/// # Error
///
/// It returns the `SearchResults` struct if the search results could be successfully fetched from
/// the cache or from the upstream search engines otherwise it returns an appropriate error.
async fn results(
    config: &Config,
    cache: &web::Data<SharedCache>,
    query: &str,
    page: u32,
    req: HttpRequest,
    safe_search: &Option<u8>,
) -> Result<SearchResults, Box<dyn std::error::Error>> {
    // eagerly parse cookie value to evaluate safe search level
    let cookie_value = req.cookie("appCookie");

    // The cookie's value is expected to hold a JSON-serialized `Cookie`
    // struct; any parse failure is treated as "no cookie".
    let cookie_value: Option<Cookie<'_>> = cookie_value
        .as_ref()
        .and_then(|cv| serde_json::from_str(cv.name_value().1).ok());

    // Resolve the effective safe search level with precedence:
    // url param (if valid) > cookie > config default.
    let safe_search_level = get_safesearch_level(
        safe_search,
        &cookie_value.as_ref().map(|cv| cv.safe_search_level),
        config.safe_search,
    );

    // Cache key is the canonical search url plus the safe search level, so
    // differing levels never share cached results.
    let mut cache_key = format!(
        "http://{}:{}/search?q={}&page={}&safesearch={}",
        config.binding_ip, config.port, query, page, safe_search_level
    );

    // Modify the cache key adding each enabled search engine to the string
    if let Some(cookie_value) = &cookie_value {
        // Engines selected by the user in the UI (stored in the cookie).
        let mut engines: Vec<String> = cookie_value
            .engines
            .iter()
            .map(|s| String::from(*s))
            .collect::<Vec<String>>();

        // We sort the list of engine so the cache keys will match between users. The cookie's list of engines is unordered.
        engines.sort();
        cache_key = cache_key + &(engines.join(""));
    } else {
        // No cookie: use the engines enabled in the config file instead.
        let mut engines: Vec<String> = config
            .upstream_search_engines
            .iter()
            .filter(|map| *map.1)
            .map(|map| String::from(&(*map.0)))
            .collect();

        // Sorted for the same cross-user cache-key stability as above.
        engines.sort();
        cache_key = cache_key + &(engines.join(""));
    }

    // fetch the cached results json.
    let cached_results = cache.cached_results(&cache_key).await;
    // check if fetched cache results was indeed fetched or it was an error and if so
    // handle the data accordingly.
    match cached_results {
        Ok(results) => Ok(results),
        Err(_) => {
            // Level 4 is the strictest mode: the query itself is checked
            // against the blocklist before any engines are contacted.
            if safe_search_level == 4 {
                let mut results: SearchResults = SearchResults::default();

                // NOTE(review): `flag` is true when the query does NOT match
                // the blocklist, yet the branch below marks such queries as
                // disallowed — this looks inverted relative to the comment;
                // verify the intended polarity against the blocklist format.
                let flag: bool =
                    !is_match_from_filter_list(file_path(FileType::BlockList)?, query)?;
                // Return early when query contains disallowed words,
                if flag {
                    results.set_disallowed();
                    // Cache the response *before* stamping the safe search
                    // level, matching the non-early-return path below.
                    cache.cache_results(&results, &cache_key).await?;
                    results.set_safe_search_level(safe_search_level);
                    return Ok(results);
                }
            }

            // check if the cookie value is empty or not if it is empty then use the
            // default selected upstream search engines from the config file otherwise
            // parse the non-empty cookie and grab the user selected engines from the
            // UI and use that.
            let mut results: SearchResults = match cookie_value {
                Some(cookie_value) => {
                    // Unknown engine names in the cookie are silently dropped
                    // (`.ok()` discards per-engine construction errors).
                    let engines: Vec<EngineHandler> = cookie_value
                        .engines
                        .iter()
                        .filter_map(|name| EngineHandler::new(name).ok())
                        .collect();

                    match engines.is_empty() {
                        false => {
                            aggregate(
                                query,
                                page,
                                config.aggregator.random_delay,
                                config.debug,
                                &engines,
                                config.request_timeout,
                                safe_search_level,
                            )
                            .await?
                        }
                        // User deselected every engine: return an empty
                        // result set flagged accordingly instead of erroring.
                        true => {
                            let mut search_results = SearchResults::default();
                            search_results.set_no_engines_selected();
                            search_results
                        }
                    }
                }
                // No cookie: aggregate using the config-enabled engines.
                // Unlike the cookie path, a bad engine name here is a hard
                // error (the config is expected to be valid).
                None => aggregate(
                    query,
                    page,
                    config.aggregator.random_delay,
                    config.debug,
                    &config
                        .upstream_search_engines
                        .clone()
                        .into_iter()
                        .filter_map(|(key, value)| value.then_some(key))
                        .map(|engine| EngineHandler::new(&engine))
                        .collect::<Result<Vec<EngineHandler>, error_stack::Report<EngineError>>>(
                        )?,
                    config.request_timeout,
                    safe_search_level,
                )
                .await?,
            };
            // No errors, no results, and the user did select engines: the
            // engines responded but everything was filtered out.
            if results.engine_errors_info().is_empty()
                && results.results().is_empty()
                && !results.no_engines_selected()
            {
                results.set_filtered();
            }
            // Cache first, then stamp the level on the returned copy only,
            // so cached entries stay level-agnostic apart from the key.
            cache.cache_results(&results, &cache_key).await?;
            results.set_safe_search_level(safe_search_level);
            Ok(results)
        }
    }
}
|
|
|
|
|
2023-09-12 18:53:32 +03:00
|
|
|
/// A helper function which checks whether the search query contains any keywords which should be
|
|
|
|
/// disallowed/allowed based on the regex based rules present in the blocklist and allowlist files.
|
2023-09-17 12:51:32 +03:00
|
|
|
///
|
|
|
|
/// # Arguments
|
|
|
|
///
|
|
|
|
/// * `file_path` - It takes the file path of the list as the argument.
|
|
|
|
/// * `query` - It takes the search query to be checked against the list as an argument.
|
|
|
|
///
|
|
|
|
/// # Error
|
|
|
|
///
|
|
|
|
/// Returns a bool indicating whether the results were found in the list or not on success
|
|
|
|
/// otherwise returns a standard error type on a failure.
|
2023-09-02 17:50:06 +03:00
|
|
|
fn is_match_from_filter_list(
|
|
|
|
file_path: &str,
|
|
|
|
query: &str,
|
|
|
|
) -> Result<bool, Box<dyn std::error::Error>> {
|
|
|
|
let mut reader = BufReader::new(File::open(file_path)?);
|
|
|
|
for line in reader.by_ref().lines() {
|
|
|
|
let re = Regex::new(&line?)?;
|
|
|
|
if re.is_match(query) {
|
2023-11-27 07:45:28 +01:00
|
|
|
return Ok(true);
|
2023-09-02 17:50:06 +03:00
|
|
|
}
|
|
|
|
}
|
2023-11-27 07:45:28 +01:00
|
|
|
|
|
|
|
Ok(false)
|
2023-09-02 17:50:06 +03:00
|
|
|
}
|
2023-11-28 11:47:35 +05:30
|
|
|
|
|
|
|
/// A helper function which returns the safe search level based on the url params
/// and cookie value.
///
/// Precedence: a valid url parameter (level 0–2) wins; a url parameter of 3 or
/// above is rejected and replaced by `default`; with no url parameter at all,
/// the cookie value is used, falling back to `default` when absent.
///
/// # Arguments
///
/// * `safe_search` - Safe search level from the url.
/// * `cookie` - User's cookie
/// * `default` - Safe search level to fall back to
fn get_safesearch_level(safe_search: &Option<u8>, cookie: &Option<u8>, default: u8) -> u8 {
    match safe_search {
        // Only levels below 3 are accepted from the url.
        Some(level) if *level < 3 => *level,
        // Out-of-range url value: ignore it and use the configured default.
        Some(_) => default,
        // No url value: defer to the cookie, then to the default.
        None => cookie.unwrap_or(default),
    }
}
|