// crabbysearch/src/engines/brave.rs (96 lines, 2.9 KiB, Rust)

//! The `brave` module handles the scraping of results from the Brave search engine
//! by querying the upstream Brave search engine with the user-provided query and with a page
//! number, if provided.
use std::collections::HashMap;
use reqwest::{header::HeaderMap, Client};
use scraper::Html;
use crate::models::aggregation_models::SearchResult;
use error_stack::{Report, Result, ResultExt};
use crate::models::engine_models::{EngineError, SearchEngine};
use super::search_result_parser::SearchResultParser;
/// Search engine backend that scrapes result pages from Brave Search.
pub struct Brave {
    /// Selector-driven parser used to detect the "no results" page and to
    /// extract individual results from the fetched HTML document.
    parser: SearchResultParser,
}
impl Brave {
    /// Builds a [`Brave`] engine whose parser is configured with the selector
    /// strings used on Brave's result page.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `SearchResultParser::new` when the
    /// parser cannot be constructed from the given selectors.
    pub fn new() -> Result<Brave, EngineError> {
        // NOTE(review): the argument order presumably is no-results, result
        // container, title, url, description — confirm against
        // `SearchResultParser::new`.
        let parser = SearchResultParser::new(
            "#results h4",
            "#results [data-pos]",
            "a > .url",
            "a",
            ".snippet-description",
        )?;

        Ok(Self { parser })
    }
}
#[async_trait::async_trait]
impl SearchEngine for Brave {
async fn results(
&self,
query: &str,
page: u32,
user_agent: &str,
client: &Client,
safe_search: u8,
:zap: perf: several optimizations for improving the performance of the engine (#540) * :recycle: refactor: initialize & store the config & cache structs as a constant (#486) - initializes & stores the config & cache structs as a static constant. - Pass the config & cache structs as a static reference to all the functions handling their respective route. * :zap: perf: replace hashmaps with vectors for fetching & aggregating results (#486) - replace hashmaps with vectors for fetching, collecting & aggregating results as it tends to be contigous & cache efficient data structure. - refactor & redesign algorithms for fetching & aggregating results centered around vectors in aggregate function. * :heavy_plus_sign: build: add the future crate (#486) * :zap: perf: use `futureunordered` for collecting results fetched from the tokio spawn tasks (#486) - using the `futureunordered` instead of vector for collecting results reduces the time it takes to fetch the results as the results do not need to come in specific order so any result that gets fetched first gets collected in the `futureunordered` type. 
Co-authored-by: Spencerjibz <spencernajib2@gmail.com> * :zap: perf: initialize new async connections parallely using tokio spawn tasks (#486) * :zap: perf: initialize redis pipeline struct once with the default size of 3 (#486) * :zap: perf: reduce branch predictions by reducing conditional code branches (#486) * :white_check_mark: test(unit): provide unit test for the `get_safesearch_level` function (#486) * :zap: perf: reduce clones & use index based loop to improve search results filtering performance (#486) * 🚨 fix(clippy): make clippy/format checks happy (#486) * 🚨 fix(build): make the cargo build check happy (#486) * :zap: perf: reduce the amount of clones, to_owneds & to_strings (#486) * :zap: perf: use async crates & methods & make functions async (#486) * :bookmark: chore(release): bump the app version (#486) --------- Co-authored-by: Spencerjibz <spencernajib2@gmail.com>
2024-03-11 12:01:30 +03:00
) -> Result<Vec<(String, SearchResult)>, EngineError> {
let url = format!("https://search.brave.com/search?q={query}&offset={page}");
let safe_search_level = match safe_search {
0 => "off",
1 => "moderate",
_ => "strict",
};
let header_map = HeaderMap::try_from(&HashMap::from([
("USER_AGENT".to_string(), user_agent.to_string()),
(
"CONTENT_TYPE".to_string(),
"application/x-www-form-urlencoded".to_string(),
),
("REFERER".to_string(), "https://google.com/".to_string()),
(
"COOKIE".to_string(),
format!("safe_search={safe_search_level}"),
),
]))
.change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document(
&Brave::fetch_html_from_upstream(self, &url, header_map, client).await?,
);
if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
if no_result_msg
.inner_html()
.contains("Not many great matches came back for your search")
{
return Err(Report::new(EngineError::EmptyResultSet));
}
}
self.parser
.parse_for_results(&document, |title, url, desc| {
url.value().attr("href").map(|url| {
SearchResult::new(
title.text().collect::<Vec<_>>().join("").trim(),
url.trim(),
desc.inner_html().trim(),
&["brave"],
)
})
})
}
}