Skip to content

Commit

Permalink
well done!
Browse files Browse the repository at this point in the history
  • Loading branch information
farooqkz committed Jan 14, 2024
1 parent ee0f76c commit af08d6a
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 52 deletions.
30 changes: 30 additions & 0 deletions src/parser/parse_from_text/find_range.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use std::ops::RangeInclusive;

/// Result of looking up which range could contain a code point.
#[derive(Debug, PartialEq, Eq)]
enum FindRangeResult<'a> {
    /// The code point is exactly the first value of one of the ranges, so it
    /// is known to be contained without any further check.
    WasOnRangeStart,
    /// The only range that can contain the code point; the caller still has
    /// to test containment.
    Range(&'a RangeInclusive<u32>),
}

/// Finds the single range of `ranges` that may contain `code`.
///
/// `ranges` has to be sorted by range start and must not overlap, because the
/// lookup is a binary search keyed on the start of each range.
///
/// Returns `None` only when `ranges` is empty.
fn find_range_for_char<'a>(
    code: u32,
    ranges: &'a [RangeInclusive<u32>],
) -> Option<FindRangeResult<'a>> {
    match ranges.binary_search_by_key(&code, |range| *range.start()) {
        Ok(_) => Some(FindRangeResult::WasOnRangeStart),
        // `insert_at` is the position where `code` would be inserted, so the
        // range starting just before it is the only candidate. When `code` is
        // smaller than every start the first range is handed back and simply
        // fails the containment test. `get` keeps an empty slice from panicking.
        Err(insert_at) => ranges
            .get(insert_at.saturating_sub(1))
            .map(FindRangeResult::Range),
    }
}

/// Returns `true` when the code point of `c` lies inside one of `ranges`.
///
/// `ranges` has to be sorted by range start and must not overlap. An empty
/// slice contains nothing, so the answer is `false`.
pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive<u32>]) -> bool {
    let code = u32::from(c);
    match find_range_for_char(code, ranges) {
        Some(FindRangeResult::WasOnRangeStart) => true,
        Some(FindRangeResult::Range(range)) => range.contains(&code),
        None => false,
    }
}
28 changes: 1 addition & 27 deletions src/parser/parse_from_text/hashtag_content_char_ranges.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::ops::RangeInclusive;

const NUMBER_OF_RANGES: usize = 850;

/*
Expand Down Expand Up @@ -869,38 +867,14 @@ const HASHTAG_CONTENT_CHAR_RANGES: [RangeInclusive<u32>; NUMBER_OF_RANGES] = [
0xe0100..=0xe01ef,
];

/// Outcome of the lookup performed by `find_range_for_char`.
#[derive(Debug, PartialEq, Eq)]
enum FindRangeResult<'a> {
/// The searched code point is equal to the start of one of the ranges.
WasOnRangeStart,
/// The range picked by the binary search; callers test containment themselves.
Range(&'a RangeInclusive<u32>),
}

/// Picks the entry of `HASHTAG_CONTENT_CHAR_RANGES` that `code` could belong to.
///
/// The table is binary-searched by range start. An exact hit is reported as
/// `WasOnRangeStart`; otherwise the range starting just before the insertion
/// point is returned (the first range when `code` is below every start).
fn find_range_for_char<'a>(code: u32) -> FindRangeResult<'a> {
    match HASHTAG_CONTENT_CHAR_RANGES.binary_search_by_key(&code, |range| *range.start()) {
        Ok(_) => FindRangeResult::WasOnRangeStart,
        Err(0) => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[0]),
        // The insertion point is non-zero in this arm, so subtracting one cannot
        // underflow, and it is at most `NUMBER_OF_RANGES`, so the resulting
        // index stays inside the array.
        #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)]
        Err(insertion_point) => {
            FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[insertion_point - 1])
        }
    }
}

pub(crate) fn hashtag_content_char(c: char) -> bool {
if matches!(c, '#' | '﹟' | '#' | ' ') {
false
} else if matches!(c, '+' | '-' | '_') {
true
} else {
let code: u32 = c as u32;
match find_range_for_char(code) {
FindRangeResult::WasOnRangeStart => true,
FindRangeResult::Range(range) => range.contains(&code),
}
is_in_one_of_ranges(c, &[HASHTAG_CONTENT_CHAR_RANGES])
}
}

Expand Down
108 changes: 84 additions & 24 deletions src/parser/parse_from_text/link_element.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
use crate::parser::link_url::LinkDestination;
use std::ops::RangeInclusive;
use super::Element;
use crate::nom::{Offset, Slice};
use nom::bytes::complete::take_while;
use nom::character::complete::char;
use nom::{
bytes::{
complete::{tag, take, take_while1},
streaming::take_till1,
complete::{tag, take, take_while1, take_while},
},
character,
combinator::{peek, recognize, verify},
sequence::tuple,
AsChar, IResult,
AsChar::is_dec_digit as is_digit
};
use super::base_parsers::*;

Expand All @@ -22,6 +22,8 @@ use super::base_parsers::*;
// Rust does not check for the second condition in an AND compound boolean
// expression if the first is already false. Therefore, in is_alpha, I've put
// c >= 0x41 before c <= 0x5a as the first has a higher chance of failing.
// nom's own is_alpha is not used because it also detects chars outside the
// ASCII range
// -- Farooq
/// Returns `true` when `c` is an ASCII letter, i.e. `A`-`Z` or `a`-`z`.
fn is_alpha(c: char) -> bool {
    let c = u32::from(c);
    // Inclusive ranges [0x41, 0x5a] (upper case) and [0x61, 0x7a] (lower case).
    (c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a)
}

/// Returns `true` when `c` is an ASCII decimal digit, i.e. `0`-`9`.
fn is_digit(c: char) -> bool {
    // Digits occupy 0x30..=0x39; comparing with the bounds swapped can never
    // be satisfied, so the standard helper is used instead.
    c.is_ascii_digit()
}

/// Code point ranges listed for `ucschar` in the RFC 3987 grammar.
///
/// The entries are in ascending order and do not overlap, which the binary
/// search in `is_in_one_of_ranges` relies on.
// The lower-case name is kept because `is_ucschar` refers to the table by it.
#[allow(non_upper_case_globals)]
const ucschar_ranges: [RangeInclusive<u32>; 17] = [
    0xa0..=0xd7ff,
    0xf900..=0xfdcf,
    0xfdf0..=0xffef,
    0x10000..=0x1fffd,
    0x20000..=0x2fffd,
    0x30000..=0x3fffd,
    0x40000..=0x4fffd,
    0x50000..=0x5fffd,
    0x60000..=0x6fffd,
    0x70000..=0x7fffd,
    0x80000..=0x8fffd,
    0x90000..=0x9fffd,
    0xa0000..=0xafffd,
    0xb0000..=0xbfffd,
    0xc0000..=0xcfffd,
    0xd0000..=0xdfffd,
    0xe1000..=0xefffd,
];

/// Returns whether `c` falls into one of the `ucschar_ranges` code point ranges.
fn is_ucschar(c: char) -> bool {
    let table = ucschar_ranges.as_slice();
    is_in_one_of_ranges(c, table)
}

fn is_other_unreserved(c: char) -> bool {
Expand All @@ -48,27 +70,65 @@ fn is_scheme(c: char) -> bool {
}

fn ihier_part(input: &str) -> IResult<&str, &str> {
let (input, content) = alt(
tag(""), // ipath-empty
recognize(
tag("//"),
take_while(is_iauthority),
take_while(is_ipath_abempty)),
recognize(
// ipath-absolute
char('/'),
opt(
tuple(
take_while(is_isegment_nz),
many0(recognize(char('/'), take_while(is_isegment)))))),
recognize(
// ipath-rootless
alt(
tag(""), // ipath-empty
tuple(
tag("//"),
take_while(is_iauthority),
take_while(is_ipath_abempty)),
tuple(
// ipath-absolute
char('/'),
opt(
tuple(
take_while(is_isegment_nz),
many0(recognize(char('/'), take_while(is_isegment))))))(input);
Ok((input, content))
many0(recognize(char('/'), take_while(is_isegment)))))),
tuple(
// ipath-rootless
take_while(is_isegment_nz),
many0(recognize(char('/'), take_while(is_isegment)))))(input)
}

/// Character test for `ipchar`: unreserved, percent-encoded or sub-delimiter
/// characters (as decided by the respective helpers), plus `:` and `@`.
fn is_ipchar(c: char) -> bool {
    let accepted_by_helpers = is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c);
    accepted_by_helpers || c == ':' || c == '@'
}

/// Code point ranges listed for `iprivate` in the RFC 3987 grammar.
///
/// The entries are in ascending order and do not overlap, which the binary
/// search in `is_in_one_of_ranges` relies on. The array length is spelled out
/// because `_` is not accepted in the type of a `const` item.
const IPRIVATE_RANGES: [RangeInclusive<u32>; 3] = [
    0xe000..=0xf8ff,
    0xf0000..=0xffffd,
    0x100000..=0x10fffd,
];

/// Returns whether `c` falls into one of the `IPRIVATE_RANGES` code point ranges.
fn is_iprivate(c: char) -> bool {
    // `is_in_one_of_ranges` takes the `char` itself and converts it to its
    // code point internally, so no cast is done here.
    is_in_one_of_ranges(c, &IPRIVATE_RANGES)
}

/// Character test for the query part: private-use characters, `ipchar`
/// characters, `/` and `?` are accepted.
fn is_iquery(c: char) -> bool {
    if is_iprivate(c) || is_ipchar(c) {
        return true;
    }
    c == '/' || c == '?'
}

/// Consumes the longest (possibly empty) prefix of `input` whose characters
/// all satisfy `is_iquery`.
fn iquery(input: &str) -> IResult<&str, &str> {
    let query_chars = take_while(is_iquery);
    query_chars(input)
}

/// Character test for the fragment part: `ipchar` characters, `/` and `?`
/// are accepted.
fn is_ifragment(c: char) -> bool {
    is_ipchar(c) || c == '/' || c == '?'
}

fn ifragment(input: &str) -> IResult<&str, &str> {
take_while(is_fragment)(input)
}

/// Consumes the longest (possibly empty) prefix of `input` whose characters
/// all satisfy `is_scheme`.
fn scheme(input: &str) -> IResult<&str, &str> {
    let scheme_chars = take_while(is_scheme);
    scheme_chars(input)
}

// Unfinished link parser: it reads a scheme, the hierarchical part and an
// optional `?query` / `#fragment` from `input`, but does not build an
// `Element` from them yet.
// NOTE(review): the body has no tail expression, so nothing is produced for the
// declared `IResult<&str, Element, CustomError<&str>>` - still to be written.
fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> {
// NOTE(review): this `recognize(` call is never closed; it looks like a
// leftover of the previous implementation - confirm and remove.
let (input, content): (&str, &str) = recognize(
let (input, scheme) = scheme(input)?;
// NOTE(review): a pair is destructured here although `ihier_part` is declared
// to return a single `&str` - confirm the intended output type.
let (input, (authority, path)) = ihier_part(input)?;
// NOTE(review): `is_query` is not defined in the visible part of the file
// (`is_iquery` is) - confirm. Presumably `tuple` needs its parsers passed as
// one tuple and the `Option` produced by `opt` cannot be matched by a tuple
// pattern - verify against the nom version in use.
let (input, (_, query)) = opt(tuple(char('?'), take_while(is_query)))(input)?;
let (input, (_, fragment)) = opt(tuple(char('#'), take_while(is_ifragment)))(input)?;


}
1 change: 0 additions & 1 deletion src/parser/parse_from_text/markdown_elements.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::parser::link_url::LinkDestination;
use crate::parser::parse_from_text::text_elements::email_address;

use super::text_elements::{link, parse_text_element};
Expand Down

0 comments on commit af08d6a

Please sign in to comment.