diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs
new file mode 100644
index 0000000..0464696
--- /dev/null
+++ b/src/parser/parse_from_text/find_range.rs
@@ -0,0 +1,44 @@
+//! Binary-search lookup of a character in a sorted table of code point ranges.
+
+use std::ops::RangeInclusive;
+
+/// Outcome of searching a sorted range table for a code point.
+#[derive(Debug, PartialEq, Eq)]
+enum FindRangeResult<'a> {
+    /// The code point is exactly the first value of one of the ranges.
+    WasOnRangeStart,
+    /// The last range that starts below the code point; it may or may not
+    /// contain the code point.
+    Range(&'a RangeInclusive<u32>),
+    /// No range starts at or below the code point (this includes an empty
+    /// table), so the code point cannot be inside any range.
+    BeforeFirstRange,
+}
+
+/// Finds the only range of `ranges` that could contain `code`.
+///
+/// `ranges` must be sorted by range start and must not overlap, otherwise the
+/// lookup result is unspecified.
+fn find_range_for_char<'a>(code: u32, ranges: &'a [RangeInclusive<u32>]) -> FindRangeResult<'a> {
+    match ranges.binary_search_by_key(&code, |range| *range.start()) {
+        Ok(_) => FindRangeResult::WasOnRangeStart,
+        Err(0) => FindRangeResult::BeforeFirstRange,
+        // `index` is the insertion point, so it is at most `ranges.len()`, and
+        // the arm above already handled 0. Therefore `index - 1` can neither
+        // overflow nor index out of bounds.
+        #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)]
+        Err(index) => FindRangeResult::Range(&ranges[index - 1]),
+    }
+}
+
+/// Returns `true` if `c` lies inside one of `ranges`.
+///
+/// `ranges` must be sorted by range start and must not overlap.
+pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive<u32>]) -> bool {
+    let code = c as u32;
+    match find_range_for_char(code, ranges) {
+        FindRangeResult::WasOnRangeStart => true,
+        FindRangeResult::Range(range) => range.contains(&code),
+        FindRangeResult::BeforeFirstRange => false,
+    }
+}
diff --git a/src/parser/parse_from_text/hashtag_content_char_ranges.rs b/src/parser/parse_from_text/hashtag_content_char_ranges.rs
index 7d9ea19..f093bd1 100644
--- a/src/parser/parse_from_text/hashtag_content_char_ranges.rs
+++ b/src/parser/parse_from_text/hashtag_content_char_ranges.rs
@@ -1,5 +1,8 @@
 use std::ops::RangeInclusive;
 
+// NOTE(review): assumes the parent module declares `mod find_range;` - confirm.
+use super::find_range::is_in_one_of_ranges;
+
 const NUMBER_OF_RANGES: usize = 850;
 
 /*
@@ -869,26 +872,6 @@ const HASHTAG_CONTENT_CHAR_RANGES: [RangeInclusive<u32>; NUMBER_OF_RANGES] = [
     0xe0100..=0xe01ef,
 ];
 
-#[derive(Debug, PartialEq, Eq)]
-enum FindRangeResult<'a> {
-    WasOnRangeStart,
-    Range(&'a RangeInclusive<u32>),
-}
-
-fn find_range_for_char<'a>(code: u32) -> FindRangeResult<'a> {
-    let index = HASHTAG_CONTENT_CHAR_RANGES.binary_search_by_key(&code, |range| *range.start());
-    match index {
-        Ok(_) => FindRangeResult::WasOnRangeStart,
-        Err(index) => match index {
-            0 => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[0]),
-            // Since `index` can never be 0, `index - 1` will never overflow. Furthermore, the
-            // maximum value which the binary search function returns is `NUMBER_OF_RANGES`.
-            // Therefore, `index - 1` will never panic if we index the array with it.
-            #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)]
-            index => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[index - 1]),
-        },
-    }
-}
 
 pub(crate) fn hashtag_content_char(c: char) -> bool {
     if matches!(c, '#' | '﹟' | '#' | ' ') {
@@ -896,11 +879,7 @@ pub(crate) fn hashtag_content_char(c: char) -> bool {
     } else if matches!(c, '+' | '-' | '_') {
         true
     } else {
-        let code: u32 = c as u32;
-        match find_range_for_char(code) {
-            FindRangeResult::WasOnRangeStart => true,
-            FindRangeResult::Range(range) => range.contains(&code),
-        }
+        is_in_one_of_ranges(c, &HASHTAG_CONTENT_CHAR_RANGES)
     }
 }
 
diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs
index c356284..da882ef 100644
--- a/src/parser/parse_from_text/link_element.rs
+++ b/src/parser/parse_from_text/link_element.rs
@@ -1,17 +1,17 @@
 use crate::parser::link_url::LinkDestination;
+use std::ops::RangeInclusive;
 use super::Element;
+use super::find_range::is_in_one_of_ranges;
 use crate::nom::{Offset, Slice};
-use nom::bytes::complete::take_while;
 use nom::character::complete::char;
 use nom::{
     bytes::{
-        complete::{tag, take, take_while1},
-        streaming::take_till1,
+        complete::{tag, take, take_while, take_while1},
     },
     character,
     combinator::{peek, recognize, verify},
     sequence::tuple,
     AsChar, IResult,
 };
 
 use super::base_parsers::*;
@@ -22,6 +22,8 @@ use super::base_parsers::*;
 // Rust does not check for the second condition in an AND compound boolean
 // expression if the first is already false. Therefore, in is_alpha, I've put
 // c >= 0x41 before c <= 0x5a as the first has a higher chance of failing.
+// nom's own is_alpha is not used as it detects also chars outside the
+// ASCII range
 // -- Farooq
 fn is_alpha(c: char) -> bool {
     let c = c as u64;
@@ -30,9 +32,35 @@ fn is_alpha(c: char) -> bool {
     (c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a &&)
 }
 
 fn is_digit(c: char) -> bool {
-    let c = c as u64;
-    c >= 0x39 && c <= 0x30
+    // ASCII digits only ('0'..='9').
+    c.is_ascii_digit()
+}
+
+// NOTE(review): these look like the `ucschar` ranges of RFC 3987 - confirm.
+// The table must stay sorted by range start for `is_in_one_of_ranges`.
+const UCSCHAR_RANGES: [RangeInclusive<u32>; 17] = [
+    0xa0..=0xd7ff,
+    0xF900..=0xFDCF,
+    0xFDF0..=0xFFEF,
+    0x10000..=0x1FFFD,
+    0x20000..=0x2FFFD,
+    0x30000..=0x3FFFD,
+    0x40000..=0x4FFFD,
+    0x50000..=0x5FFFD,
+    0x60000..=0x6FFFD,
+    0x70000..=0x7FFFD,
+    0x80000..=0x8FFFD,
+    0x90000..=0x9FFFD,
+    0xA0000..=0xAFFFD,
+    0xB0000..=0xBFFFD,
+    0xC0000..=0xCFFFD,
+    0xD0000..=0xDFFFD,
+    0xE1000..=0xEFFFD,
+];
+
+fn is_ucschar(c: char) -> bool {
+    is_in_one_of_ranges(c, &UCSCHAR_RANGES)
 }
 
 fn is_other_unreserved(c: char) -> bool {
@@ -48,27 +76,69 @@ fn is_scheme(c: char) -> bool {
 }
 
 fn ihier_part(input: &str) -> IResult<&str, &str> {
-    let (input, content) = alt(
-        tag(""), // ipath-empty
-        recognize(
-            tag("//"),
-            take_while(is_iauthority),
-            take_while(is_ipath_abempty)),
-        recognize(
-            // ipath-absolute
-            char('/'),
-            opt(
-                tuple(
-                    take_while(is_isegment_nz),
-                    many0(recognize(char('/'), take_while(is_isegment)))))),
-        recognize(
-            // ipath-rootless
+    // TODO(review): unfinished. If `alt`/`tuple` are nom's combinators, each takes a single
+    // tuple of parsers, and `tag("")` as the first alternative always succeeds, so the later
+    // branches would never be tried - confirm and rework.
+    alt(
+        tag(""), // ipath-empty
+        tuple(
+            tag("//"),
+            take_while(is_iauthority),
+            take_while(is_ipath_abempty)),
+        tuple(
+            // ipath-absolute
+            char('/'),
+            opt(
                 tuple(
                     take_while(is_isegment_nz),
-                    many0(recognize(char('/'), take_while(is_isegment))))))(input);
-    Ok((input, content))
+                    many0(recognize(char('/'), take_while(is_isegment)))))),
+        tuple(
+            // ipath-rootless
+            take_while(is_isegment_nz),
+            many0(recognize(char('/'), take_while(is_isegment)))))(input)
+}
+
+fn is_ipchar(c: char) -> bool {
+    is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c) || matches!(c, ':' | '@')
+}
+
+// NOTE(review): these look like the `iprivate` ranges of RFC 3987 - confirm.
+// The table must stay sorted by range start for `is_in_one_of_ranges`.
+const IPRIVATE_RANGES: [RangeInclusive<u32>; 3] = [
+    0xe000..=0xf8ff,
+    0xf0000..=0xffffd,
+    0x100000..=0x10fffd,
+];
+
+fn is_iprivate(c: char) -> bool {
+    is_in_one_of_ranges(c, &IPRIVATE_RANGES)
+}
+
+fn is_iquery(c: char) -> bool {
+    is_iprivate(c) || is_ipchar(c) || matches!(c, '/' | '?')
+}
+
+fn iquery(input: &str) -> IResult<&str, &str> {
+    take_while(is_iquery)(input)
+}
+
+fn is_ifragment(c: char) -> bool {
+    is_ipchar(c) || matches!(c, '/' | '?')
+}
+
+fn ifragment(input: &str) -> IResult<&str, &str> {
+    take_while(is_ifragment)(input)
+}
+
+fn scheme(input: &str) -> IResult<&str, &str> {
+    take_while(is_scheme)(input)
 }
 
 fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> {
-    let (input, content): (&str, &str) = recognize(
+    let (input, scheme) = scheme(input)?;
+    let (input, (authority, path)) = ihier_part(input)?;
+    let (input, (_, query)) = opt(tuple(char('?'), take_while(is_iquery)))(input)?;
+    let (input, (_, fragment)) = opt(tuple(char('#'), take_while(is_ifragment)))(input)?;
+    // TODO(review): unfinished - nothing is returned yet. If `opt` is nom's combinator it yields
+    // an `Option`, so the tuple patterns on the two lines above cannot match as written.
 }
diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs
index 35a9a88..06fbd1e 100644
--- a/src/parser/parse_from_text/markdown_elements.rs
+++ b/src/parser/parse_from_text/markdown_elements.rs
@@ -1,4 +1,3 @@
-use crate::parser::link_url::LinkDestination;
 use crate::parser::parse_from_text::text_elements::email_address;
 
 use super::text_elements::{link, parse_text_element};