well done!

deltachat · Jan 14, 2024 · af08d6a · af08d6a
1 parent ee0f76c
commit af08d6a
Show file tree

Hide file tree

Showing 4 changed files with 115 additions and 52 deletions.
diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs
@@ -0,0 +1,30 @@
+use std::ops::RangeInclusive;
+
+#[derive(Debug, PartialEq, Eq)]
+enum FindRangeResult<'a> {
+    WasOnRangeStart,
+    Range(&'a RangeInclusive<u32>),
+}
+
+fn find_range_for_char<'a>(code: u32, ranges: &'a [RangeInclusive<u32>]) -> FindRangeResult<'a> {
+    let index = ranges.binary_search_by_key(&code, |range| *range.start());
+    match index {
+        Ok(_) => FindRangeResult::WasOnRangeStart,
+        Err(index) => match index {
+            0 => FindRangeResult::Range(&ranges[0]),
+            // `ranges` must be non-empty (the arm above indexes `[0]`) and sorted by start.
+            // Here `index` is never 0, and on `Err` the binary search returns at most
+            // `ranges.len()`, so `index - 1` neither underflows nor indexes out of bounds.
+            #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)]
+            index => FindRangeResult::Range(&ranges[index - 1]),
+        },
+    }
+}
+
+pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive<u32>]) -> bool {
+    let c = c as u32;
+    match find_range_for_char(c, ranges) {
+        FindRangeResult::WasOnRangeStart => true,
+        FindRangeResult::Range(range) => range.contains(&c),
+    }
+}
diff --git a/src/parser/parse_from_text/hashtag_content_char_ranges.rs b/src/parser/parse_from_text/hashtag_content_char_ranges.rs
@@ -1,5 +1,3 @@
-use std::ops::RangeInclusive;
-
 const NUMBER_OF_RANGES: usize = 850;
 
 /*
@@ -869,38 +867,14 @@ const HASHTAG_CONTENT_CHAR_RANGES: [RangeInclusive<u32>; NUMBER_OF_RANGES] = [
     0xe0100..=0xe01ef,
 ];
 
-#[derive(Debug, PartialEq, Eq)]
-enum FindRangeResult<'a> {
-    WasOnRangeStart,
-    Range(&'a RangeInclusive<u32>),
-}
-
-fn find_range_for_char<'a>(code: u32) -> FindRangeResult<'a> {
-    let index = HASHTAG_CONTENT_CHAR_RANGES.binary_search_by_key(&code, |range| *range.start());
-    match index {
-        Ok(_) => FindRangeResult::WasOnRangeStart,
-        Err(index) => match index {
-            0 => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[0]),
-            // Since `index` can never be 0, `index - 1` will never overflow. Furthermore, the
-            // maximum value which the binary search function returns is `NUMBER_OF_RANGES`.
-            // Therefore, `index - 1` will never panic if we index the array with it.
-            #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)]
-            index => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[index - 1]),
-        },
-    }
-}
 
 pub(crate) fn hashtag_content_char(c: char) -> bool {
     if matches!(c, '#' | '﹟' | '＃' | ' ') {
         false
     } else if matches!(c, '+' | '-' | '_') {
         true
     } else {
-        let code: u32 = c as u32;
-        match find_range_for_char(code) {
-            FindRangeResult::WasOnRangeStart => true,
-            FindRangeResult::Range(range) => range.contains(&code),
-        }
+        is_in_one_of_ranges(c, &HASHTAG_CONTENT_CHAR_RANGES[..])
     }
 }
 

diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs
@@ -1,17 +1,17 @@
 use crate::parser::link_url::LinkDestination;
+use std::ops::RangeInclusive;
 use super::Element;
 use crate::nom::{Offset, Slice};
-use nom::bytes::complete::take_while;
 use nom::character::complete::char;
 use nom::{
     bytes::{
-        complete::{tag, take, take_while1},
-        streaming::take_till1,
+        complete::{tag, take, take_while1, take_while},
     },
     character,
     combinator::{peek, recognize, verify},
     sequence::tuple,
     AsChar, IResult,
+    AsChar::is_dec_digit as is_digit
 };
 use super::base_parsers::*;
 
@@ -22,6 +22,8 @@ use super::base_parsers::*;
 // Rust does not check for the second condition in an AND compound boolean
 // expression if the first is already false. Therefore, in is_alpha, I've put 
 // c >= 0x41 before c <= 0x5a as the first has a higher chance of failing.
+// nom's own is_alpha is not used as it also matches chars outside the
+// ASCII range
 // -- Farooq
 fn is_alpha(c: char) -> bool {
     let c = c as u64;
@@ -30,9 +32,29 @@ fn is_alpha(c: char) -> bool {
     (c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a &&)
 }
 
-fn is_digit(c: char) -> bool {
-    let c = c as u64;
-    c >= 0x39 && c <= 0x30
+
+const ucschar_ranges: [RangeInclusive<u32>; 17] = [
+    0xa0..=0xd7ff,
+    0xF900..=0xFDCF,
+    0xFDF0..=0xFFEF,
+    0x10000..=0x1FFFD,
+    0x20000..=0x2FFFD,
+    0x30000..=0x3FFFD,
+    0x40000..=0x4FFFD,
+    0x50000..=0x5FFFD,
+    0x60000..=0x6FFFD,
+    0x70000..=0x7FFFD,
+    0x80000..=0x8FFFD,
+    0x90000..=0x9FFFD,
+    0xA0000..=0xAFFFD,
+    0xB0000..=0xBFFFD,
+    0xC0000..=0xCFFFD,
+    0xD0000..=0xDFFFD,
+    0xE1000..=0xEFFFD,
+];
+
+fn is_ucschar(c: char) -> bool {
+    is_in_one_of_ranges(c, &ucschar_ranges[..])
 }
 
 fn is_other_unreserved(c: char) -> bool {
@@ -48,27 +70,65 @@ fn is_scheme(c: char) -> bool {
 }
 
 fn ihier_part(input: &str) -> IResult<&str, &str> {
-    let (input, content) = alt(
-            tag(""), // ipath-empty
-            recognize(
-                tag("//"), 
-                take_while(is_iauthority),
-                take_while(is_ipath_abempty)),
-            recognize(
-                // ipath-absolute
-                char('/'),
-                opt(
-                    tuple(
-                        take_while(is_isegment_nz),
-                        many0(recognize(char('/'), take_while(is_isegment)))))),
-            recognize(
-                // ipath-rootless
+    alt(
+        tag(""), // ipath-empty
+        tuple(
+            tag("//"), 
+            take_while(is_iauthority),
+            take_while(is_ipath_abempty)),
+        tuple(
+            // ipath-absolute
+            char('/'),
+            opt(
                 tuple(
                     take_while(is_isegment_nz),
-                    many0(recognize(char('/'), take_while(is_isegment))))))(input);
-    Ok((input, content)) 
+                    many0(recognize(char('/'), take_while(is_isegment)))))),
+        tuple(
+            // ipath-rootless
+            take_while(is_isegment_nz),
+            many0(recognize(char('/'), take_while(is_isegment)))))(input)
+}
+
+fn is_ipchar(c: char) -> bool {
+    is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c) || matches!(c, ':' | '@')
+}
+
+const IPRIVATE_RANGES: [RangeInclusive<u32>; 3] = [
+    0xe000..=0xf8ff,
+    0xf0000..=0xffffd,
+    0x100000..=0x10fffd,
+];
+
+fn is_iprivate(c: char) -> bool {
+    // Pass the `char` through unchanged: `is_in_one_of_ranges` does the `u32` conversion itself.
+    is_in_one_of_ranges(c, &IPRIVATE_RANGES[..])
+}
+
+fn is_iquery(c: char) -> bool {
+    is_iprivate(c) || is_ipchar(c) || matches!(c, '/' | '?')
+}
+
+fn iquery(input: &str) -> IResult<&str, &str> {
+    take_while(is_iquery)(input)
+}
+
+fn is_ifragment(c: char) -> bool {
+    is_ipchar(c) || matches!(c, '/' | '?')
+}
+
+fn ifragment(input: &str) -> IResult<&str, &str> {
+    take_while(is_ifragment)(input)
+}
+
+fn scheme(input: &str) -> IResult<&str, &str> {
+    take_while(is_scheme)(input)
 }
 
 fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> {
-    let (input, content): (&str, &str) = recognize(
+    let (input, scheme) = scheme(input)?;
+    let (input, (authority, path)) = ihier_part(input)?;
+    let (input, (_, query)) = opt(tuple(char('?'), take_while(is_query)))(input)?;
+    let (input, (_, fragment)) = opt(tuple(char('#'), take_while(is_ifragment)))(input)?;
+
+
 }
diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs
@@ -1,4 +1,3 @@
-use crate::parser::link_url::LinkDestination;
 use crate::parser::parse_from_text::text_elements::email_address;
 
 use super::text_elements::{link, parse_text_element};