crabbysearch/src/engines/bing.rs

//! The `bing` module handles the scraping of results from the Bing search
//! engine by querying upstream Bing with the user-provided query and, if
//! provided, a page number.
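//!
//! # Example
//!
//! A minimal usage sketch (the surrounding async context, the `reqwest::Client`,
//! and the user-agent string are assumptions here, so this is not run as a
//! doctest):
//!
//! ```ignore
//! use crate::engines::bing::Bing;
//! use crate::models::engine_models::SearchEngine;
//!
//! let engine = Bing::new()?;
//! let client = reqwest::Client::new();
//! let results = engine
//!     .results("rust programming", 0, "Mozilla/5.0 (X11; Linux x86_64)", &client)
//!     .await?;
//! println!("fetched {} results", results.len());
//! ```
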
use std::collections::HashMap;
use regex::Regex;
use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html;
use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};
use super::search_result_parser::SearchResultParser;

/// A Bing engine type, defined to implement the `SearchEngine` trait. Having a
/// dedicated type reduces code duplication and makes it easy to collect
/// different search engines into a single vector.
pub struct Bing {
    /// The parser used to interpret the search results.
    parser: SearchResultParser,
}

impl Bing {
    /// Creates the Bing parser.
    pub fn new() -> Result<Self, EngineError> {
        Ok(Self {
            // The CSS selectors handed to the parser: the no-results
            // container, each result item, and the title, URL and
            // description elements within a result.
            parser: SearchResultParser::new(
                ".b_results",
                ".b_algo",
                "h2 a",
                ".tpcn a.tilk",
                ".b_caption p",
            )?,
        })
    }
}

#[async_trait::async_trait]
impl SearchEngine for Bing {
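    /// Retrieves one page of search results from the upstream Bing search
    /// engine and parses them into `(String, SearchResult)` pairs.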
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        client: &Client,
    ) -> Result<Vec<(String, SearchResult)>, EngineError> {
        // Bing uses a "start results from this number" convention, so with
        // 10 results per page, page 0 starts at result 1, page 1 at result
        // 11, and so on.
        let results_per_page = 10;
        let start_result = results_per_page * page + 1;
        let url: String = match page {
            0 => {
                format!("https://www.bing.com/search?q={query}")
            }
            _ => {
                format!("https://www.bing.com/search?q={query}&first={start_result}")
            }
        };
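        // Illustrative URLs for the query `rust`: page 0 requests
        // `https://www.bing.com/search?q=rust`, while page 2 requests
        // `https://www.bing.com/search?q=rust&first=21`.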
        // Key/value pairs that are serialized into the `Cookie` header below.
        let query_params: Vec<(&str, &str)> = vec![
            ("_EDGE_V", "1"),
            ("SRCHD=AF", "NOFORM"),
            ("_Rwho=u", "d"),
            ("bngps=s", "0"),
            ("_UR=QS=0&TQS", "0"),
        ];
        let mut cookie_string = String::new();
        for (k, v) in &query_params {
            cookie_string.push_str(&format!("{k}={v}; "));
        }
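        // At this point `cookie_string` reads
        // `_EDGE_V=1; SRCHD=AF=NOFORM; _Rwho=u=d; bngps=s=0; _UR=QS=0&TQS=0; `
        // (including the trailing `; ` separator).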

        let header_map = HeaderMap::try_from(&HashMap::from([
            ("User-Agent".to_string(), user_agent.to_string()),
            ("Referer".to_string(), "https://google.com/".to_string()),
            (
                "Content-Type".to_string(),
                "application/x-www-form-urlencoded".to_string(),
            ),
            ("Cookie".to_string(), cookie_string),
        ]))
        .change_context(EngineError::UnexpectedError)?;

        let document: Html = Html::parse_document(
            &self.fetch_html_from_upstream(&url, header_map, client).await?,
        );

        // Bing is very aggressive at finding matches, even for the most
        // absurd of queries; ".b_algo" is the class of a result list item.
        if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).next() {
            if no_result_msg
                .value()
                .attr("class")
                .map(|classes| classes.contains("b_algo"))
                .unwrap_or(false)
            {
                return Err(Report::new(EngineError::EmptyResultSet));
            }
        }

        // Strips a leading `<span>…</span>` block (optionally followed by
        // `&nbsp;·`) from description snippets.
        let re_span = Regex::new(r#"<span.*?>.*?(?:</span>&nbsp;·|</span>)"#).unwrap();
        // Strips the `<strong>` tags Bing uses for query-term highlighting.
        let re_strong = Regex::new(r#"(<strong>|</strong>)"#).unwrap();
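        // For example, a hypothetical raw description such as
        // `<span>Jan 1, 2024</span>&nbsp;·A <strong>rust</strong> crate`
        // comes out as `A rust crate` after both replacements.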

        // Scrape all the results from the HTML.
        self.parser
            .parse_for_results(&document, |title, url, desc| {
                Some(SearchResult::new(
                    &re_strong.replace_all(title.inner_html().trim(), ""),
                    url.value().attr("href").unwrap(),
                    &re_span.replace_all(desc.inner_html().trim(), ""),
                    &["bing"],
                ))
            })
    }
}
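
// A minimal sanity check, added as a sketch: it assumes that
// `SearchResultParser::new` only validates the CSS selectors it is given
// (no network access), so the constructor can be exercised offline.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn bing_engine_constructs() {
        assert!(Bing::new().is_ok());
    }
}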