From 128b66f4a4fb6bf4f09818428352581b2036c612 Mon Sep 17 00:00:00 2001
From: j-mendez
Date: Tue, 27 Aug 2024 15:52:11 -0400
Subject: [PATCH] feat(page): add with_return_page_links configuration [#8]

---
 Cargo.lock          | 14 +++++++-------
 Cargo.toml          |  2 +-
 book/src/website.md | 14 ++++++++++++++
 src/npage.rs        | 29 +++++++++++++++++++----------
 src/page.rs         |  8 +++++++-
 src/website.rs      | 11 +++++++++++
 6 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 94f4ba7..0fdf7f0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2370,9 +2370,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.34"
+version = "0.38.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
+checksum = "a85d50532239da68e9addb745ba38ff4612a242c1c7ceea689c4bc7c2f43c36f"
 dependencies = [
  "bitflags 2.6.0",
  "errno",
@@ -2427,9 +2427,9 @@ checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0"
 
 [[package]]
 name = "rustls-webpki"
-version = "0.102.6"
+version = "0.102.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e"
+checksum = "84678086bd54edf2b415183ed7a94d0efb049f1b646a33e22a36f3794be6ae56"
 dependencies = [
  "ring",
  "rustls-pki-types",
@@ -2694,9 +2694,9 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "2.0.17"
+version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22cbe3e048eb876764dffa17de8532edbeb66ceec9110f6db7441d923c58086a"
+checksum = "6cc111d3f26cbceb70d37fabab58ed4730de66b0d2144dc67805dcdee318d6f0"
 dependencies = [
  "ahash",
  "async-openai",
@@ -2739,7 +2739,7 @@ dependencies = [
 
 [[package]]
 name = "spider_rs"
-version = "0.0.48"
+version = "0.0.49"
 dependencies = [
  "indexmap",
  "num_cpus",
diff --git a/Cargo.toml b/Cargo.toml
index 6d0de2f..fdb9a4b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 edition = "2021"
 name = "spider_rs"
-version = "0.0.48"
+version = "0.0.49"
 repository = "https://github.com/spider-rs/spider-py"
 license = "MIT"
 description = "The fastest web crawler and indexer."
diff --git a/book/src/website.md b/book/src/website.md
index db0ec37..9878600 100644
--- a/book/src/website.md
+++ b/book/src/website.md
@@ -20,6 +20,20 @@ async def main():
 asyncio.run(main())
 ```
 
+### Return Page Links
+
+Return the links found on the page resource.
+
+```py
+import asyncio
+from spider_rs import Website
+
+async def main():
+    website = Website("https://choosealicense.com").with_return_page_links(True)
+
+asyncio.run(main())
+```
+
 ### Custom Headers
 
 Add custom HTTP headers to use when crawling/scraping.
diff --git a/src/npage.rs b/src/npage.rs
index 4250a80..01fc454 100644
--- a/src/npage.rs
+++ b/src/npage.rs
@@ -1,33 +1,33 @@
-use std::collections::HashMap;
-
+use crate::page::header_map_to_hash_map;
 use pyo3::prelude::*;
-
 use spider::{
   lazy_static::lazy_static,
   packages::scraper::{Html, Selector},
 };
-
-use crate::page::header_map_to_hash_map;
+use std::collections::{HashMap, HashSet};
 
 /// a simple page object
 #[derive(Default, Clone)]
 #[pyclass]
 pub struct NPage {
   #[pyo3(get)]
-  /// the url found.
+  /// The url of the resource.
   pub url: String,
   #[pyo3(get)]
-  /// the content of the page found.
+  /// The content of the page found as UTF-8.
   pub content: String,
   #[pyo3(get)]
-  /// the HTTP status code.
+  /// The HTTP status code.
   pub status_code: u16,
   #[pyo3(get)]
-  /// the raw content
+  /// The raw content in bytes.
   pub raw_content: Option<Vec<u8>>,
   #[pyo3(get)]
-  /// the headers
+  /// The HTTP headers.
   pub headers: Option<HashMap<String, String>>,
+  #[pyo3(get)]
+  /// The links found on the page. Requires the website builder method `with_return_page_links` to be set to true.
+  pub links: Option<HashSet<String>>,
 }
 
 /// get the page title.
@@ -54,6 +54,15 @@ pub fn new_page(res: &spider::page::Page, raw: bool) -> NPage {
       Some(ref headers) => Some(header_map_to_hash_map(headers)),
       _ => None,
     },
+    links: match res.page_links {
+      Some(ref links) => Some(
+        links
+          .iter()
+          .map(|link| link.as_ref().to_string())
+          .collect::<HashSet<String>>(),
+      ),
+      _ => None,
+    },
   }
 }
 
diff --git a/src/page.rs b/src/page.rs
index 79f322f..3101049 100644
--- a/src/page.rs
+++ b/src/page.rs
@@ -1,5 +1,5 @@
 use pyo3::{pyclass, pymethods, PyRef, PyRefMut};
-use spider::{compact_str::CompactString, reqwest::header::HeaderMap};
+use spider::{compact_str::CompactString, hashbrown::HashSet, reqwest::header::HeaderMap};
 use std::collections::HashMap;
 
 /// a simple page object
@@ -15,10 +15,16 @@ pub struct Page {
   )>,
   /// the url for the page
   pub url: String,
+  /// Whether subdomains are being crawled.
   pub subdomains: Option<bool>,
+  /// Whether TLDs are being crawled.
   pub tld: Option<bool>,
+  /// The HTTP status code.
   pub status_code: u16,
+  /// The HTTP headers.
   pub headers: Option<HashMap<String, String>>,
+  /// The links found on the page. Requires the website builder method `with_return_page_links` to be set to true.
+  pub links: Option<HashSet<String>>,
 }
 
 /// convert a headermap to hashmap
diff --git a/src/website.rs b/src/website.rs
index 72e68e3..fbcf9c2 100644
--- a/src/website.rs
+++ b/src/website.rs
@@ -718,6 +718,17 @@ impl Website {
     slf
   }
 
+  /// Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` feature is enabled.
+  pub fn with_return_page_links(
+    mut slf: PyRefMut<'_, Self>,
+    return_page_links: bool,
+  ) -> PyRefMut<'_, Self> {
+    slf
+      .inner
+      .with_return_page_links(return_page_links);
+    slf
+  }
+
   /// Wait for a delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled.
   pub fn with_wait_for_delay(
     mut slf: PyRefMut<'_, Self>,
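
The book example added by this patch only enables the option. Below is a minimal sketch of how the returned links might be consumed from Python, assuming the subscription-callback pattern shown elsewhere in the spider_rs book; the `LinkSubscription` class name and the `crawl(handler)` call are illustrative and not part of this patch, while `page.links` is the new attribute exposed here (either `None` or a set of URL strings).

```py
import asyncio
from spider_rs import Website

# Illustrative handler following the book's subscription-callback pattern.
class LinkSubscription:
    def __init__(self):
        self.links = set()

    def __call__(self, page):
        # page.links is the field added by this patch; it is None unless
        # with_return_page_links(True) was set on the website builder.
        if page.links is not None:
            self.links.update(page.links)

async def main():
    website = Website("https://choosealicense.com").with_return_page_links(True)
    handler = LinkSubscription()
    website.crawl(handler)
    print(f"collected {len(handler.links)} links")

asyncio.run(main())
```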