Merge branch 'rolling' into rolling
commit d28cbb96a1
7 changed files with 86 additions and 18 deletions
@@ -4,7 +4,7 @@
 use std::collections::HashMap;

-use reqwest::header::HeaderMap;
+use reqwest::{header::HeaderMap, Client};
 use scraper::Html;

 use crate::models::aggregation_models::SearchResult;
@@ -42,7 +42,7 @@ impl SearchEngine for Brave {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError> {
         let url = format!("https://search.brave.com/search?q={query}&offset={page}");
@@ -68,7 +68,7 @@ impl SearchEngine for Brave {
             .change_context(EngineError::UnexpectedError)?;

         let document: Html = Html::parse_document(
-            &Brave::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
+            &Brave::fetch_html_from_upstream(self, &url, header_map, client).await?,
         );

         if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
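The same signature change repeats for every engine in this diff: instead of taking a `request_timeout` and building a throwaway client per request, each engine now borrows a `reqwest::Client` owned by the caller. A minimal sketch of why that is worth doing (the function name and URLs are illustrative, not from the codebase): a `Client` holds a connection pool, so requests sent through the same instance can reuse TCP/TLS connections to a host.

use reqwest::Client;

// Both requests go through the pool held by `client`, so the second request
// to the same host can skip TCP and TLS setup entirely.
async fn demo(client: &Client) -> reqwest::Result<()> {
    client.get("https://example.com/a").send().await?;
    client.get("https://example.com/b").send().await?;
    Ok(())
}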
@@ -5,6 +5,7 @@
 use std::collections::HashMap;

 use reqwest::header::HeaderMap;
+use reqwest::Client;
 use scraper::Html;

 use crate::models::aggregation_models::SearchResult;
@@ -44,7 +45,7 @@ impl SearchEngine for DuckDuckGo {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         _safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
@@ -76,7 +77,7 @@ impl SearchEngine for DuckDuckGo {
             .change_context(EngineError::UnexpectedError)?;

         let document: Html = Html::parse_document(
-            &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
+            &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, client).await?,
         );

         if self.parser.parse_for_no_results(&document).next().is_some() {
@@ -3,6 +3,7 @@
 //! number if provided.

 use reqwest::header::HeaderMap;
+use reqwest::Client;
 use scraper::Html;
 use std::collections::HashMap;
@@ -40,7 +41,7 @@ impl SearchEngine for Searx {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         mut safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
@@ -68,7 +69,7 @@ impl SearchEngine for Searx {
             .change_context(EngineError::UnexpectedError)?;

         let document: Html = Html::parse_document(
-            &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
+            &Searx::fetch_html_from_upstream(self, &url, header_map, client).await?,
         );

         if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
@@ -3,7 +3,8 @@

 use super::aggregation_models::SearchResult;
 use error_stack::{Report, Result, ResultExt};
-use std::{collections::HashMap, fmt, time::Duration};
+use reqwest::Client;
+use std::{collections::HashMap, fmt};

 /// A custom error type used for handle engine associated errors.
 #[derive(Debug)]
@@ -71,12 +72,11 @@ pub trait SearchEngine: Sync + Send {
         &self,
         url: &str,
         header_map: reqwest::header::HeaderMap,
-        request_timeout: u8,
+        client: &Client,
     ) -> Result<String, EngineError> {
         // fetch the html from upstream search engine
-        Ok(reqwest::Client::new()
+        Ok(client
             .get(url)
-            .timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
             .headers(header_map) // add spoofed headers to emulate human behavior
             .send()
             .await
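With the per-request `.timeout(...)` call removed, the deadline has to come from the client itself. A minimal sketch of the two placements; both `timeout` methods are real reqwest APIs, and the URL is illustrative:

use std::time::Duration;
use reqwest::ClientBuilder;

async fn demo() -> reqwest::Result<()> {
    // Before the change: each call site set its own deadline on the request.
    let per_call = reqwest::Client::new();
    per_call
        .get("https://example.com")
        .timeout(Duration::from_secs(5))
        .send()
        .await?;

    // After the change: the deadline is configured once on the shared Client.
    let shared = ClientBuilder::new()
        .timeout(Duration::from_secs(5))
        .build()?;
    shared.get("https://example.com").send().await?;
    Ok(())
}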
@@ -109,7 +109,7 @@ pub trait SearchEngine: Sync + Send {
         query: &str,
         page: u32,
         user_agent: &str,
-        request_timeout: u8,
+        client: &Client,
         safe_search: u8,
     ) -> Result<HashMap<String, SearchResult>, EngineError>;
 }
@@ -9,6 +9,7 @@ use crate::models::{
 };
 use error_stack::Report;
 use regex::Regex;
+use reqwest::{Client, ClientBuilder};
 use std::time::{SystemTime, UNIX_EPOCH};
 use std::{
     collections::HashMap,
@@ -18,6 +19,9 @@ use std::{
 use std::{fs::File, io::BufRead};
 use tokio::task::JoinHandle;

+/// A constant for holding the prebuilt Client globally in the app.
+static CLIENT: std::sync::OnceLock<Client> = std::sync::OnceLock::new();
+
 /// Aliases for long type annotations
 type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
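`std::sync::OnceLock` (in std since Rust 1.70) is a thread-safe cell that can be written exactly once; `get_or_init` either runs the closure to fill it or returns what is already stored. A minimal std-only sketch of the semantics the static above relies on, with illustrative names:

use std::sync::OnceLock;

static GREETING: OnceLock<String> = OnceLock::new();

fn main() {
    assert!(GREETING.get().is_none()); // nothing stored yet
    // The first caller initializes the cell; a reference borrowed from a
    // static lives for the whole program, hence &'static.
    let s: &'static String = GREETING.get_or_init(|| "hello".to_owned());
    assert_eq!(s, "hello");
    // A second write is refused and the rejected value is handed back.
    assert_eq!(GREETING.set("other".to_owned()), Err("other".to_owned()));
}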
@@ -68,6 +72,16 @@ pub async fn aggregate(
     request_timeout: u8,
     safe_search: u8,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
+    let client = CLIENT.get_or_init(|| {
+        ClientBuilder::new()
+            .timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
+            .https_only(true)
+            .gzip(true)
+            .brotli(true)
+            .build()
+            .unwrap()
+    });
+
     let user_agent: &str = random_user_agent();

     // Add a random delay before making the request.
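A consequence of `get_or_init` worth noting: `aggregate` still accepts `request_timeout`, but only the value passed on the first call is baked into the client; later calls with a different timeout silently reuse the client that already exists. A minimal sketch of that first-call-wins behavior, with a plain `u64` standing in for the client:

use std::sync::OnceLock;

static TIMEOUT: OnceLock<u64> = OnceLock::new();

fn effective_timeout(requested: u64) -> u64 {
    // The closure runs only while the cell is still empty.
    *TIMEOUT.get_or_init(|| requested)
}

fn main() {
    assert_eq!(effective_timeout(30), 30); // the first call initializes
    assert_eq!(effective_timeout(99), 30); // later values are ignored
}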
@@ -88,7 +102,7 @@ pub async fn aggregate(
         let query: String = query.to_owned();
         tasks.push(tokio::spawn(async move {
             search_engine
-                .results(&query, page, user_agent, request_timeout, safe_search)
+                .results(&query, page, user_agent, client, safe_search)
                 .await
         }));
     }
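The call site can hand `client` straight into `tokio::spawn` because `get_or_init` on a static yields a `&'static Client`, and a `&'static` reference is `Copy + Send`: each task captures its own copy of the pointer while sharing the one pool behind it. A minimal sketch, assuming tokio with the `macros` and multi-threaded runtime features enabled; the URL is illustrative:

use std::sync::OnceLock;
use reqwest::Client;

static CLIENT: OnceLock<Client> = OnceLock::new();

#[tokio::main]
async fn main() {
    let client: &'static Client = CLIENT.get_or_init(Client::new);
    let task = tokio::spawn(async move {
        // `client` here is a copied reference, not a cloned or new Client.
        client.get("https://example.com").send().await.map(|resp| resp.status())
    });
    println!("{:?}", task.await.expect("spawned task panicked"));
}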
[diff truncated: the remaining changed files did not load]