crabbysearch/src/engines/duckduckgo.rs

118 lines
3.9 KiB
Rust
Raw Normal View History

2023-04-27 16:06:59 +03:00
//! The `duckduckgo` module handles the scraping of results from the duckduckgo search engine
//! by querying the upstream duckduckgo search engine with user provided query and with a page
//! number if provided.
2023-07-14 17:16:13 +03:00
use std::collections::HashMap;
2023-04-22 14:35:07 +03:00
use reqwest::header::HeaderMap;
2023-09-24 13:54:08 +02:00
use scraper::Html;
2023-04-22 14:35:07 +03:00
use crate::models::aggregation_models::SearchResult;
2023-04-22 14:35:07 +03:00
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};
2023-09-24 13:54:08 +02:00
use super::search_result_parser::SearchResultParser;
2023-07-15 13:36:46 +03:00
/// The DuckDuckGo engine type, defined to implement the `SearchEngine` trait; this reduces
/// code duplication and makes it easy to build a vector of different search engines.
2023-09-24 13:54:08 +02:00
pub struct DuckDuckGo {
    /// Holds the pre-compiled CSS selectors (no-result marker, result container,
    /// title, URL and snippet) used to extract results from DuckDuckGo's HTML.
    parser: SearchResultParser,
}
impl DuckDuckGo {
    /// Builds a new `DuckDuckGo` engine instance with its result parser.
    ///
    /// # Errors
    ///
    /// Propagates an `EngineError` report when any of the CSS selectors
    /// required by the parser fails to compile.
    pub fn new() -> Result<Self, EngineError> {
        let parser = SearchResultParser::new(
            ".no-results",
            ".result",
            ".result__a",
            ".result__url",
            ".result__snippet",
        )?;
        Ok(Self { parser })
    }
}
#[async_trait::async_trait]
impl SearchEngine for DuckDuckGo {
2023-09-03 19:34:22 +03:00
async fn results(
&self,
query: &str,
page: u32,
user_agent: &str,
request_timeout: u8,
_safe_search: u8,
) -> Result<HashMap<String, SearchResult>, EngineError> {
// Page number can be missing or empty string and so appropriate handling is required
// so that upstream server recieves valid page number.
let url: String = match page {
1 | 0 => {
format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
}
_ => {
format!(
"https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
query,
(page / 2 + (page % 2)) * 30,
(page / 2 + (page % 2)) * 30 + 1
)
}
};
2023-04-22 14:35:07 +03:00
// initializing HeaderMap and adding appropriate headers.
let header_map = HeaderMap::try_from(&HashMap::from([
("USER_AGENT".to_string(), user_agent.to_string()),
("REFERER".to_string(), "https://google.com/".to_string()),
(
"CONTENT_TYPE".to_string(),
"application/x-www-form-urlencoded".to_string(),
),
("COOKIE".to_string(), "kl=wt-wt".to_string()),
]))
.change_context(EngineError::UnexpectedError)?;
let document: Html = Html::parse_document(
&DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
);
2023-09-24 13:54:08 +02:00
if document.select(&self.parser.no_result).next().is_some() {
return Err(Report::new(EngineError::EmptyResultSet));
}
2023-04-22 14:35:07 +03:00
// scrape all the results from the html
Ok(document
2023-09-24 13:54:08 +02:00
.select(&self.parser.results)
.map(|result| {
SearchResult::new(
result
2023-09-24 13:54:08 +02:00
.select(&self.parser.result_title)
.next()
.unwrap()
.inner_html()
.trim(),
format!(
"https://{}",
result
2023-09-24 13:54:08 +02:00
.select(&self.parser.result_url)
.next()
.unwrap()
.inner_html()
.trim()
)
.as_str(),
result
2023-09-24 13:54:08 +02:00
.select(&self.parser.result_desc)
.next()
.unwrap()
.inner_html()
.trim(),
&["duckduckgo"],
)
})
.map(|search_result| (search_result.url.clone(), search_result))
.collect())
}
2023-04-22 14:35:07 +03:00
}