From 79718c16f8a1d38440f249de856f05a0ce12fcac Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Wed, 17 Jan 2024 12:37:38 +0330 Subject: [PATCH] implementing IRIs still --- src/parser/parse_from_text/find_range.rs | 25 ++++- src/parser/parse_from_text/link_element.rs | 122 ++++++++++++++++----- 2 files changed, 120 insertions(+), 27 deletions(-) diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs index 0464696..840a081 100644 --- a/src/parser/parse_from_text/find_range.rs +++ b/src/parser/parse_from_text/find_range.rs @@ -6,6 +6,21 @@ enum FindRangeResult<'a> { Range(&'a RangeInclusive), } + +/// Find the range that `code` might be in. +/// +/// # Description +/// This function gets a sorted slice of inclusive u32 ranges, performs +/// binary search on them and returns a FindRangeResult enum telling +/// which range the `code` might be in. It returns `FindRangeResult::WasOnRangeStart` +/// if the code is exactly at the start of a range, or a `FindRangeResult::Range(range)` +/// which indicates `code` is in `range` or in no ranges. +/// +/// # Arguments +/// +/// - `code` the u32 to look for a range for. +/// +/// - `ranges` a reference to a slice of `RangeInclusive` (NOTE(review): the body searches `HASHTAG_CONTENT_CHAR_RANGES`, not `ranges` — confirm) fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive]) -> FindRangeResult<'a> { let index = HASHTAG_CONTENT_CHAR_RANGES.binary_search_by_key(&code, |range| *range.start()); match index { @@ -21,7 +36,15 @@ fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive]) -> FindRan } } -pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive) -> bool { + +/// Returns true if `c` is in one of the `ranges`, false otherwise. 
+/// +/// # Arguments +/// +/// - `c` A character +/// +/// - `ranges` A sorted slice of ranges to see if `c` is in any one of them +pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive]) -> bool { let c = c as u32; match find_range_for_char(c, ranges) { FindRangeResult::WasOnRangeStart => true, diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index da882ef..543fd9f 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -9,7 +9,7 @@ use nom::{ }, character, combinator::{peek, recognize, verify}, - sequence::tuple, + sequence::{tuple, preceded}, AsChar, IResult, AsChar::is_dec_digit as is_digit }; @@ -33,6 +33,8 @@ fn is_alpha(c: char) -> bool { } + +// These ranges have been extracted from RFC 3987, page 8. const ucschar_ranges: [RangeInclusive, _] = [ 0xa0..=0xd7ff, 0xF900..=0xFDCF, @@ -57,11 +59,25 @@ fn is_ucschar(c: char) -> bool { is_in_one_of_ranges(c, &ucschar_ranges[..]) } +fn is_unreserved(c: char) -> bool { + is_alpha(c) || is_digit(c) || is_other_unreserved(c) +} + +fn is_iunreserved(c: char) -> bool { + is_ucschar(c) || is_unreserved(c) +} + fn is_other_unreserved(c: char) -> bool { - let c = c as u64; - matches!(c, '-' | '_' | '.' | '_' | '~') + matches!(c, '_' | '.' | '_' | '~') } +fn is_pct_encoded(c: [char; 3]) -> bool { + c[0] == '%' && is_hex_digit(c[1]) && is_hex_digit(c[2]) +} + +fn is_sub_delim(c: char) -> bool { + matches!(c, '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=') +} // Here again, order is important. As URLs/IRIs have letters in them // most of the time and less digits or other characters. 
--Farooq @@ -69,24 +85,65 @@ fn is_scheme(c: char) -> bool { is_alpha(c) || is_digit(c) || is_scheme(c) } -fn ihier_part(input: &str) -> IResult<&str, &str> { - alt( - tag(""), // ipath-empty - tuple( - tag("//"), - take_while(is_iauthority), - take_while(is_ipath_abempty)), - tuple( - // ipath-absolute - char('/'), - opt( - tuple( - take_while(is_isegment_nz), - many0(recognize(char('/'), take_while(is_isegment)))))), - tuple( - // ipath-rootless - take_while(is_isegment_nz), - many0(recognize(char('/'), take_while(is_isegment)))))(input) + +fn is_ipv4(c: char) -> bool { + is_digit(c) || c == '.' +} + +fn ipv4(input: &str) -> IResult<&str, &str> { + let (input, possible_ipv4) = take_while_m_n(7, 15, is_ipv4)(input); + // This might be an IPv4 address; check that it splits into four dot-separated numeric parts + let inner_pair = separated_pair(take_while1(is_digit), char('.'), take_while1(is_digit)); + let ((part0, part1), (part2, part3)) = separated_pair(inner_pair, char('.'), inner_pair)(input)?; + part0.parse::()?; + part1.parse::()?; + part2.parse::()?; + part3.parse::()?; + Ok((input, possible_ipv4)) +} + +fn is_ireg_name(c: char) -> bool { + is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c) +} + +fn ip_literal(input: &str) -> IResult<&str, &str> { + +} + +/// Parse host +/// +/// # Description +/// +/// Parse host. Returns the remaining input, the host string and a boolean indicating +/// whether the host is an IP literal (IPv6 or IPvFuture). 
+fn parse_host(input: &str) -> IResult<&str, &str, bool> { + let (input, host) = ip_literal(input)?; + if host.is_some() { + // Parsing succeeded, so this is an IP literal, meaning + // it's either IPv6 or IPvFuture + Ok((input, host.unwrap(), true)) + } else { + let (input, host) = alt((ipv4, take_while(is_ireg_name)))(input)?; + Ok((input, host, false)) + } +} + +fn iauthority(input: &str) -> IResult<&str, &str, &str, &str, bool> { + let (input, userinfo) = opt(take_while(is_userinfo), char('@'))(input); + let (input, host, is_ipv6) = parse_host(input); + let (input, port) = preceded(char(':'), take_while(is_digit))(input); + Ok((input, userinfo, host, port, is_ipv6)) +} + +fn ihier_part(input: &str) -> IResult<&str, &str, &str> { + let (input, authority) = preceded(tag("//"), iauthoriy)(input); + let (input, path) = alt( + take_while(is_ipath_abempty), + char(''), // ipath-empty + take_while(is_ipath_absolute), + take_while(is_ipath_rootless) + )(input); + Ok((input, authority, path)) } fn is_ipchar(c: char) -> bool { @@ -124,11 +181,24 @@ fn scheme(input: &str) -> IResult<&str, &str> { take_while(is_scheme)(input) } +fn is_alphanum_or_hyphen_minus(char: char) -> bool { + match char { + '-' => true, + _ => char.is_alphanum(), + } +} + fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { let (input, scheme) = scheme(input)?; - let (input, (authority, path)) = ihier_part(input)?; - let (input, (_, query)) = opt(tuple(char('?'), take_while(is_query)))(input)?; - let (input, (_, fragment)) = opt(tuple(char('#'), take_while(is_ifragment)))(input)?; - - + let (input, (userinfo, hostport, is_ipv6), path) = ihier_part(input)?; + let (input, query) = opt(preceed(char('?'), take_while(is_query)))(input)?; + let (input, fragment) = opt(preceed(char('#'), take_while(is_ifragment)))(input)?; + Element::Link { + destination: LinkDestination { + target: input, + hostname: Some(hostport), + punycode: None, + scheme: scheme + } + } }