From 128b66f4a4fb6bf4f09818428352581b2036c612 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Tue, 27 Aug 2024 15:52:11 -0400
Subject: [PATCH] feat(page): add with_return_page_links configuration [#8]

---
 Cargo.lock          | 14 +++++++-------
 Cargo.toml          |  2 +-
 book/src/website.md | 14 ++++++++++++++
 src/npage.rs        | 29 +++++++++++++++++++----------
 src/page.rs         |  8 +++++++-
 src/website.rs      | 11 +++++++++++
 6 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 94f4ba7..0fdf7f0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2370,9 +2370,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.34"
+version = "0.38.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
+checksum = "a85d50532239da68e9addb745ba38ff4612a242c1c7ceea689c4bc7c2f43c36f"
 dependencies = [
  "bitflags 2.6.0",
  "errno",
@@ -2427,9 +2427,9 @@ checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0"
 
 [[package]]
 name = "rustls-webpki"
-version = "0.102.6"
+version = "0.102.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e"
+checksum = "84678086bd54edf2b415183ed7a94d0efb049f1b646a33e22a36f3794be6ae56"
 dependencies = [
  "ring",
  "rustls-pki-types",
@@ -2694,9 +2694,9 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "2.0.17"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22cbe3e048eb876764dffa17de8532edbeb66ceec9110f6db7441d923c58086a"
+checksum = "6cc111d3f26cbceb70d37fabab58ed4730de66b0d2144dc67805dcdee318d6f0"
 dependencies = [
  "ahash",
  "async-openai",
@@ -2739,7 +2739,7 @@ dependencies = [
 
 [[package]]
 name = "spider_rs"
-version = "0.0.48"
+version = "0.0.49"
 dependencies = [
  "indexmap",
  "num_cpus",
diff --git a/Cargo.toml b/Cargo.toml
index 6d0de2f..fdb9a4b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 edition = "2021"
 name = "spider_rs"
-version = "0.0.48"
+version = "0.0.49"
 repository = "https://github.com/spider-rs/spider-py"
 license = "MIT"
 description = "The fastest web crawler and indexer."
diff --git a/book/src/website.md b/book/src/website.md
index db0ec37..9878600 100644
--- a/book/src/website.md
+++ b/book/src/website.md
@@ -20,6 +20,20 @@ async def main():
 asyncio.run(main())
 ```
 
+### Return Page Links
+
+Return the links found on the page resource.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://choosealicense.com").with_return_page_links(True)
+
+asyncio.run(main())
+```
+
 ### Custom Headers
 
 Add custom HTTP headers to use when crawling/scraping.
diff --git a/src/npage.rs b/src/npage.rs
index 4250a80..01fc454 100644
--- a/src/npage.rs
+++ b/src/npage.rs
@@ -1,33 +1,33 @@
-use std::collections::HashMap;
-
+use crate::page::header_map_to_hash_map;
 use pyo3::prelude::*;
-
 use spider::{
   lazy_static::lazy_static,
   packages::scraper::{Html, Selector},
 };
-
-use crate::page::header_map_to_hash_map;
+use std::collections::{HashMap, HashSet};
 
 /// a simple page object
 #[derive(Default, Clone)]
 #[pyclass]
 pub struct NPage {
   #[pyo3(get)]
-  /// the url found.
+  /// The url of the resource.
   pub url: String,
   #[pyo3(get)]
-  /// the content of the page found.
+  /// The content of the page found as UTF-8.
   pub content: String,
   #[pyo3(get)]
-  /// the HTTP status code.
+  /// The HTTP status code.
   pub status_code: u16,
   #[pyo3(get)]
-  /// the raw content
+  /// The raw content in bytes.
   pub raw_content: Option<Vec<u8>>,
   #[pyo3(get)]
-  /// the headers
+  /// The HTTP headers.
   pub headers: Option<HashMap<String, String>>,
+  #[pyo3(get)]
+  /// The links found on the page. Requires the website builder method `with_return_page_links` to be set to true.
+  pub links: Option<HashSet<String>>,
 }
 
 /// get the page title.
@@ -54,6 +54,15 @@ pub fn new_page(res: &spider::page::Page, raw: bool) -> NPage {
       Some(ref headers) => Some(header_map_to_hash_map(headers)),
       _ => None,
     },
+    links: match res.page_links {
+      Some(ref links) => Some(
+        links
+          .iter()
+          .map(|link| link.as_ref().to_string())
+          .collect::<HashSet<String>>(),
+      ),
+      _ => None,
+    },
   }
 }
 
diff --git a/src/page.rs b/src/page.rs
index 79f322f..3101049 100644
--- a/src/page.rs
+++ b/src/page.rs
@@ -1,5 +1,5 @@
 use pyo3::{pyclass, pymethods, PyRef, PyRefMut};
-use spider::{compact_str::CompactString, reqwest::header::HeaderMap};
+use spider::{compact_str::CompactString, hashbrown::HashSet, reqwest::header::HeaderMap};
 use std::collections::HashMap;
 
 /// a simple page object
@@ -15,10 +15,16 @@ pub struct Page {
   )>,
   /// the url for the page
   pub url: String,
+  /// Whether subdomains are being crawled.
   pub subdomains: Option<bool>,
+  /// Whether TLDs are being crawled.
   pub tld: Option<bool>,
+  /// The HTTP status code.
   pub status_code: u16,
+  /// The HTTP headers.
   pub headers: Option<HashMap<String, String>>,
+  /// The links found on the page. Requires the website builder method `with_return_page_links` to be set to true.
+  pub links: Option<HashSet<String>>,
 }
 
 /// convert a headermap to hashmap
diff --git a/src/website.rs b/src/website.rs
index 72e68e3..fbcf9c2 100644
--- a/src/website.rs
+++ b/src/website.rs
@@ -718,6 +718,17 @@ impl Website {
     slf
   }
 
+  /// Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` feature is enabled.
+  pub fn with_return_page_links(
+    mut slf: PyRefMut<'_, Self>,
+    return_page_links: bool,
+  ) -> PyRefMut<'_, Self> {
+    slf
+      .inner
+      .with_return_page_links(return_page_links);
+    slf
+  }
+
   /// Wait for a delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled.
   pub fn with_wait_for_delay(
     mut slf: PyRefMut<'_, Self>,
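
The book example added by this patch only enables the option. Below is a minimal sketch of how the returned links might be consumed from Python, assuming the subscription-callback pattern shown elsewhere in the spider_rs book; the `LinkSubscription` class name and the `crawl(handler)` call are illustrative and not part of this patch, while `page.links` is the new attribute exposed here (either `None` or a set of URL strings).

```py
import asyncio
from spider_rs import Website

# Illustrative handler following the book's subscription-callback pattern.
class LinkSubscription:
    def __init__(self):
        self.links = set()

    def __call__(self, page):
        # page.links is the field added by this patch; it is None unless
        # with_return_page_links(True) was set on the website builder.
        if page.links is not None:
            self.links.update(page.links)

async def main():
    website = Website("https://choosealicense.com").with_return_page_links(True)
    handler = LinkSubscription()
    website.crawl(handler)
    print(f"collected {len(handler.links)} links")

asyncio.run(main())
```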