Skip to content

Commit

Permalink
implementing IRIs still
Browse files Browse the repository at this point in the history
  • Loading branch information
farooqkz committed Jan 17, 2024
1 parent af08d6a commit 79718c1
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 27 deletions.
25 changes: 24 additions & 1 deletion src/parser/parse_from_text/find_range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,21 @@ enum FindRangeResult<'a> {
Range(&'a RangeInclusive<u32>),
}


/// Find the range that `code` might be in.
///
/// # Description
/// This function gets a sorted slice of inclusive u32 ranges, performs
/// binary search on them and returns a FindRangeResult enum telling
/// which range the `code` might be in. It returns `FindRangeResult::WasOnRangeStart`
/// if the code was exactly on start of a range. Or a `FindRangeResult::Range(range)`
/// which indicates `code` is in `range` or in no ranges.
///
/// # Arguments
///
/// - `code` the u32 to look for a range for.
///
/// - `ranges` a reference to a slice of `RangeInclusive<u32>`
fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive<u32>]) -> FindRangeResult<'a> {
let index = HASHTAG_CONTENT_CHAR_RANGES.binary_search_by_key(&code, |range| *range.start());
match index {
Expand All @@ -21,7 +36,15 @@ fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive<u32>]) -> FindRan
}
}

pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive<u32>) -> bool {

/// Returns true if `c` is in one of the `ranges`, false otherwise.
///
/// # Arguments
///
/// - `c` A character
///
/// - `ranges` A sorted slice of ranges to check whether `c` is in any one of them
pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive<u32>]) -> bool {
let c = c as u32;
match find_range_for_char(c, ranges) {
FindRangeResult::WasOnRangeStart => true,
Expand Down
122 changes: 96 additions & 26 deletions src/parser/parse_from_text/link_element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use nom::{
},
character,
combinator::{peek, recognize, verify},
sequence::tuple,
sequence::{tuple, preceded},
AsChar, IResult,
AsChar::is_dec_digit as is_digit
};
Expand All @@ -33,6 +33,8 @@ fn is_alpha(c: char) -> bool {
}



// These ranges have been extracted from RFC3987, Page 8.
const ucschar_ranges: [RangeInclusive<u32>, _] = [
0xa0..=0xd7ff,
0xF900..=0xFDCF,
Expand All @@ -57,36 +59,91 @@ fn is_ucschar(c: char) -> bool {
is_in_one_of_ranges(c, &ucschar_ranges[..])
}

/// Reports whether `c` is accepted by `is_alpha`, `is_digit` or
/// `is_other_unreserved`, tried in that order.
fn is_unreserved(c: char) -> bool {
    if is_alpha(c) {
        return true;
    }
    if is_digit(c) {
        return true;
    }
    is_other_unreserved(c)
}

/// Reports whether `c` satisfies `is_ucschar` or, failing that,
/// `is_unreserved`.
fn is_iunreserved(c: char) -> bool {
    if is_ucschar(c) {
        true
    } else {
        is_unreserved(c)
    }
}

/// Returns `true` when `c` is one of the four punctuation marks that
/// RFC 3986 (section 2.3) lists as unreserved: `-`, `.`, `_` or `~`.
///
/// Letters and digits are not handled here; this only covers the
/// non-alphanumeric part of the unreserved set.
fn is_other_unreserved(c: char) -> bool {
    // Match on the `char` directly: casting to an integer first would make
    // the comparison against character literals a type error.
    matches!(c, '-' | '.' | '_' | '~')
}

/// Checks a three-character window: a literal `%` followed by two
/// characters that `is_hex_digit` accepts.
fn is_pct_encoded(c: [char; 3]) -> bool {
    let [marker, high, low] = c;
    marker == '%' && is_hex_digit(high) && is_hex_digit(low)
}

/// Reports whether `c` is one of the delimiter characters
/// `!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;` or `=`.
fn is_sub_delim(c: char) -> bool {
    // The accepted set, spelled out once as a string and searched by char.
    const SUB_DELIMS: &str = "!$&'()*+,;=";
    SUB_DELIMS.contains(c)
}

/// Returns `true` if `c` may appear in a URI/IRI scheme.
///
/// RFC 3986 (section 3.1) defines
/// `scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )`.
/// This is a per-character predicate, so it does not enforce that the first
/// character of the scheme is a letter.
fn is_scheme(c: char) -> bool {
    // Here again, order is important: URLs/IRIs mostly consist of letters,
    // with digits and other characters being less common. --Farooq
    //
    // The last alternative must be the literal punctuation set; calling
    // `is_scheme` again here would recurse forever for any character that is
    // neither a letter nor a digit.
    is_alpha(c) || is_digit(c) || matches!(c, '+' | '-' | '.')
}

// NOTE(review): this definition has no closing brace before the next `fn`,
// and a second `ihier_part` is defined further down in this file.
// Presumably this is the superseded version and should be removed — TODO confirm.
// It tries, in order: an empty path, `//` + authority + path-abempty,
// an absolute path, and a rootless path.
fn ihier_part(input: &str) -> IResult<&str, &str> {
alt(
tag(""), // ipath-empty
tuple(
tag("//"),
take_while(is_iauthority),
take_while(is_ipath_abempty)),
tuple(
// ipath-absolute
char('/'),
opt(
tuple(
take_while(is_isegment_nz),
many0(recognize(char('/'), take_while(is_isegment)))))),
tuple(
// ipath-rootless
take_while(is_isegment_nz),
many0(recognize(char('/'), take_while(is_isegment)))))(input)

/// Reports whether `c` is a character `is_digit` accepts or a literal `.`.
fn is_ipv4(c: char) -> bool {
    if is_digit(c) {
        true
    } else {
        c == '.'
    }
}

fn ipv4(input: &str) -> IResult<&str, &str> {
let (input, possible_ipv4) = take_while_m_n(7, 15, is_ipv4)(input);
// This might be an IPv4
let inner_pair = separated_pair(take_while1(is_digit), char('.'), take_while1(is_digit));
let ((part0, part1), (part2, part3)) = separated_pair(inner_pair, char('.'), inner_pair)(input)?;
part0.parse::<u8>()?;
part1.parse::<u8>()?;
part2.parse::<u8>()?;
part3.parse::<u8>()?;
Ok((input, possible_ipv4))
}

fn is_ireg_name(c: char) -> bool {
is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c)
}

/// NOTE(review): unimplemented stub — the body is empty, so nothing is
/// returned for the declared `IResult<&str, &str>`.
/// `parse_host` calls this first and treats success as "IPv6 or IPvFuture";
/// presumably this should parse a bracketed IP literal — TODO confirm.
fn ip_literal(input: &str) -> IResult<&str, &str> {

}

/// Parse host
///
/// # Description
///
/// Parse host. Returns the rest, the host string and a boolean indicating
/// if it is IPvFuture or IPv6.
///
/// Tries `ip_literal` first; otherwise falls back to `ipv4`, then to a run of
/// characters accepted by `is_ireg_name`.
///
/// NOTE(review): the third type argument of `IResult` is `bool` here, and the
/// `Ok` values below are written as three separate items — TODO confirm
/// against nom's `IResult<I, O, E>`; the flag likely belongs inside the
/// output as `(&str, bool)`.
fn parse_host(input: &str) -> IResult<&str, &str, bool> {
// NOTE(review): `?` returns early when `ip_literal` fails, so the `else`
// branch below only runs if `host` is an `Option` — but `ip_literal` is
// declared to yield `&str`. Presumably `opt(ip_literal)` was intended.
let (input, host) = ip_literal(input)?;
if host.is_some() {
// It got parsed, then it's an IP Literal meaning
// it's either IPv6 or IPvFuture
Ok((input, host.unwrap(), true))
} else {
let (input, host) = alt((ipv4, take_while(is_ireg_name)))(input)?;
Ok((input, host, false))
}
}

/// Parses the authority part: optional userinfo ending in `@`, then the host
/// via `parse_host`, then a `:`-prefixed run of digits as the port.
/// Returns the rest, userinfo, host, port and the flag from `parse_host`.
///
/// NOTE(review): `IResult` is given five type arguments, and none of the three
/// parser calls below is followed by `?`, so each `let` destructures the
/// parser's result directly — TODO confirm the intended shape.
/// NOTE(review): `is_userinfo` is not defined in the visible part of this file.
fn iauthority(input: &str) -> IResult<&str, &str, &str, &str, bool> {
// NOTE(review): `opt` is passed two arguments here; presumably a
// `terminated(take_while(is_userinfo), char('@'))` was meant — TODO confirm.
let (input, userinfo) = opt(take_while(is_userinfo), char('@'))(input);
let (input, host, is_ipv6) = parse_host(input);
// NOTE(review): the port is not wrapped in `opt`, so as written a missing
// `:` would be a parse failure rather than "no port".
let (input, port) = preceded(char(':'), take_while(is_digit))(input);
Ok((input, userinfo, host, port, is_ipv6))
}

/// Parses the hierarchical part: `//` followed by the authority, then one of
/// the path alternatives. Returns the rest, the authority and the path.
///
/// NOTE(review): another `ihier_part` is defined earlier in this file;
/// presumably this one replaces it — TODO confirm.
fn ihier_part(input: &str) -> IResult<&str, &str, &str> {
// NOTE(review): `iauthoriy` is spelled differently from the `iauthority`
// function defined above, and the result is destructured without `?`.
let (input, authority) = preceded(tag("//"), iauthoriy)(input);
// NOTE(review): `alt` receives four separate arguments rather than one tuple,
// and `char('')` is an empty character literal — TODO confirm how the empty
// path should be expressed.
let (input, path) = alt(
take_while(is_ipath_abempty),
char(''), // ipath-empty
take_while(is_ipath_absolute),
take_while(is_ipath_rootless)
)(input);
Ok((input, authority, path))
}

fn is_ipchar(c: char) -> bool {
Expand Down Expand Up @@ -124,11 +181,24 @@ fn scheme(input: &str) -> IResult<&str, &str> {
take_while(is_scheme)(input)
}

/// Accepts the hyphen-minus `-` as well as any character for which
/// `is_alphanum()` holds.
fn is_alphanum_or_hyphen_minus(char: char) -> bool {
    char == '-' || char.is_alphanum()
}

/// Parses a link: scheme, hierarchical part, then an optional `?query` and
/// an optional `#fragment`, and builds an `Element::Link` from the pieces.
///
/// NOTE(review): the body contains two consecutive parse sequences
/// (`ihier_part`, query, fragment are each parsed twice with different
/// destructuring); presumably the first group is leftover from an earlier
/// revision — TODO confirm and remove one.
fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> {
let (input, scheme) = scheme(input)?;
let (input, (authority, path)) = ihier_part(input)?;
let (input, (_, query)) = opt(tuple(char('?'), take_while(is_query)))(input)?;
let (input, (_, fragment)) = opt(tuple(char('#'), take_while(is_ifragment)))(input)?;


let (input, (userinfo, hostport, is_ipv6), path) = ihier_part(input)?;
// NOTE(review): `preceed` is spelled differently from the `preceded`
// combinator imported from `nom::sequence` — TODO confirm.
let (input, query) = opt(preceed(char('?'), take_while(is_query)))(input)?;
let (input, fragment) = opt(preceed(char('#'), take_while(is_ifragment)))(input)?;
// NOTE(review): the final expression is a bare `Element::Link`, not an
// `Ok((rest, element))` as the declared `IResult` return type suggests.
Element::Link {
destination: LinkDestination {
// NOTE(review): `input` at this point is the remaining, unparsed text
// rather than the text that was consumed as the link — TODO confirm.
target: input,
hostname: Some(hostport),
punycode: None,
scheme: scheme
}
}
}

0 comments on commit 79718c1

Please sign in to comment.