From 55d358c82d6160ff3812f2748dbac609edaef608 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Wed, 10 Jan 2024 21:07:56 +0330 Subject: [PATCH 01/74] starting to work with #16 --- src/parser/parse_from_text/link_element.rs | 27 +++++++++++++++++++++ src/parser/parse_from_text/text_elements.rs | 5 ++++ 2 files changed, 32 insertions(+) create mode 100644 src/parser/parse_from_text/link_element.rs diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs new file mode 100644 index 0000000..6ab5853 --- /dev/null +++ b/src/parser/parse_from_text/link_element.rs @@ -0,0 +1,27 @@ +use crate::parser::link_url::LinkDestination; +use super::Element; +use crate::nom::{Offset, Slice}; +use nom::bytes::complete::take_while; +use nom::character::complete::char; +use nom::{ + bytes::{ + complete::{tag, take, take_while1}, + streaming::take_till1, + }, + character, + combinator::{peek, recognize, verify}, + sequence::tuple, + AsChar, IResult, +}; +use super::base_parsers::*; + +// Link syntax here is according to RFC 3986 & 3987 --Farooq + + +fn is_alpha(c: char) -> bool { + let c = c as u64; + // basically in inclusive ranges of [0x40, 0x5a] OR + // [0x61, 0x7a] + // TODO: order the conditions for better performance + c >= 0x41 && c <= 0x7a && c <= 0x5a && c >= 0x61 +} diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index 6914cd6..476ea31 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -98,6 +98,11 @@ fn not_link_part_char(c: char) -> bool { !matches!(c, ':' | '\n' | '\r' | '\t' | ' ') } + +fn link(input: &str) -> IResult<&str, (), CustomError<&str>> { + let (input, _) = take_while1(link_scheme)(input)?; +} + /// rough recognition of an link, results gets checked by a real link parser fn link_intern(input: &str) -> IResult<&str, (), CustomError<&str>> { let (input, _) = take_while1(not_link_part_char)(input)?; 
From ee0f76cf28c751f7b2f560caa1d310b38d33639f Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 14 Jan 2024 18:55:41 +0330 Subject: [PATCH 02/74] ihier part complete --- src/parser/parse_from_text/link_element.rs | 53 ++++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 6ab5853..c356284 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -18,10 +18,57 @@ use super::base_parsers::*; // Link syntax here is according to RFC 3986 & 3987 --Farooq +// In these fucntions checking for ranges, order is important. Remember that +// Rust does not check for the second condition in an AND compound boolean +// expression if the first is already false. Therefore, in is_alpha, I've put +// c >= 0x41 before c <= 0x5a as the first has a higher chance of failing. +// -- Farooq fn is_alpha(c: char) -> bool { let c = c as u64; - // basically in inclusive ranges of [0x40, 0x5a] OR + // basically in inclusive ranges of [0x41, 0x5a] OR // [0x61, 0x7a] - // TODO: order the conditions for better performance - c >= 0x41 && c <= 0x7a && c <= 0x5a && c >= 0x61 + (c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a &&) +} + +fn is_digit(c: char) -> bool { + let c = c as u64; + c >= 0x39 && c <= 0x30 +} + +fn is_other_unreserved(c: char) -> bool { + let c = c as u64; + matches!(c, '-' | '_' | '.' | '_' | '~') +} + + +// Here again, order is important. As URLs/IRIs have letters in them +// most of the time and less digits or other characters. 
--Farooq +fn is_scheme(c: char) -> bool { + is_alpha(c) || is_digit(c) || is_scheme(c) +} + +fn ihier_part(input: &str) -> IResult<&str, &str> { + let (input, content) = alt( + tag(""), // ipath-empty + recognize( + tag("//"), + take_while(is_iauthority), + take_while(is_ipath_abempty)), + recognize( + // ipath-absolute + char('/'), + opt( + tuple( + take_while(is_isegment_nz), + many0(recognize(char('/'), take_while(is_isegment)))))), + recognize( + // ipath-rootless + tuple( + take_while(is_isegment_nz), + many0(recognize(char('/'), take_while(is_isegment))))))(input); + Ok((input, content)) +} + +fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { + let (input, content): (&str, &str) = recognize( } From af08d6a29575002cb6dc42d6ceaf215059b2a195 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 14 Jan 2024 20:30:05 +0330 Subject: [PATCH 03/74] well done! --- src/parser/parse_from_text/find_range.rs | 30 +++++ .../hashtag_content_char_ranges.rs | 28 +---- src/parser/parse_from_text/link_element.rs | 108 ++++++++++++++---- .../parse_from_text/markdown_elements.rs | 1 - 4 files changed, 115 insertions(+), 52 deletions(-) create mode 100644 src/parser/parse_from_text/find_range.rs diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs new file mode 100644 index 0000000..0464696 --- /dev/null +++ b/src/parser/parse_from_text/find_range.rs @@ -0,0 +1,30 @@ +use std::ops::RangeInclusive; + +#[derive(Debug, PartialEq, Eq)] +enum FindRangeResult<'a> { + WasOnRangeStart, + Range(&'a RangeInclusive), +} + +fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive]) -> FindRangeResult<'a> { + let index = HASHTAG_CONTENT_CHAR_RANGES.binary_search_by_key(&code, |range| *range.start()); + match index { + Ok(_) => FindRangeResult::WasOnRangeStart, + Err(index) => match index { + 0 => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[0]), + // Since `index` can never be 0, `index - 1` will never overflow. 
Furthermore, the + // maximum value which the binary search function returns is `NUMBER_OF_RANGES`. + // Therefore, `index - 1` will never panic if we index the array with it. + #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)] + index => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[index - 1]), + }, + } +} + +pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive) -> bool { + let c = c as u32; + match find_range_for_char(c, ranges) { + FindRangeResult::WasOnRangeStart => true, + FindRangeResult::Range(range) => range.contains(&c), + } +} diff --git a/src/parser/parse_from_text/hashtag_content_char_ranges.rs b/src/parser/parse_from_text/hashtag_content_char_ranges.rs index 7d9ea19..f093bd1 100644 --- a/src/parser/parse_from_text/hashtag_content_char_ranges.rs +++ b/src/parser/parse_from_text/hashtag_content_char_ranges.rs @@ -1,5 +1,3 @@ -use std::ops::RangeInclusive; - const NUMBER_OF_RANGES: usize = 850; /* @@ -869,26 +867,6 @@ const HASHTAG_CONTENT_CHAR_RANGES: [RangeInclusive; NUMBER_OF_RANGES] = [ 0xe0100..=0xe01ef, ]; -#[derive(Debug, PartialEq, Eq)] -enum FindRangeResult<'a> { - WasOnRangeStart, - Range(&'a RangeInclusive), -} - -fn find_range_for_char<'a>(code: u32) -> FindRangeResult<'a> { - let index = HASHTAG_CONTENT_CHAR_RANGES.binary_search_by_key(&code, |range| *range.start()); - match index { - Ok(_) => FindRangeResult::WasOnRangeStart, - Err(index) => match index { - 0 => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[0]), - // Since `index` can never be 0, `index - 1` will never overflow. Furthermore, the - // maximum value which the binary search function returns is `NUMBER_OF_RANGES`. - // Therefore, `index - 1` will never panic if we index the array with it. 
- #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)] - index => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[index - 1]), - }, - } -} pub(crate) fn hashtag_content_char(c: char) -> bool { if matches!(c, '#' | '﹟' | '#' | ' ') { @@ -896,11 +874,7 @@ pub(crate) fn hashtag_content_char(c: char) -> bool { } else if matches!(c, '+' | '-' | '_') { true } else { - let code: u32 = c as u32; - match find_range_for_char(code) { - FindRangeResult::WasOnRangeStart => true, - FindRangeResult::Range(range) => range.contains(&code), - } + is_in_one_of_ranges(c, &[HASHTAG_CONTENT_CHAR_RANGES]) } } diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index c356284..da882ef 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -1,17 +1,17 @@ use crate::parser::link_url::LinkDestination; +use std::ops::RangeInclusive; use super::Element; use crate::nom::{Offset, Slice}; -use nom::bytes::complete::take_while; use nom::character::complete::char; use nom::{ bytes::{ - complete::{tag, take, take_while1}, - streaming::take_till1, + complete::{tag, take, take_while1, take_while}, }, character, combinator::{peek, recognize, verify}, sequence::tuple, AsChar, IResult, + AsChar::is_dec_digit as is_digit }; use super::base_parsers::*; @@ -22,6 +22,8 @@ use super::base_parsers::*; // Rust does not check for the second condition in an AND compound boolean // expression if the first is already false. Therefore, in is_alpha, I've put // c >= 0x41 before c <= 0x5a as the first has a higher chance of failing. 
+// nom's own is_alpha is not used as it detects also chars outside the +// ASCII range // -- Farooq fn is_alpha(c: char) -> bool { let c = c as u64; @@ -30,9 +32,29 @@ fn is_alpha(c: char) -> bool { (c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a &&) } -fn is_digit(c: char) -> bool { - let c = c as u64; - c >= 0x39 && c <= 0x30 + +const ucschar_ranges: [RangeInclusive, _] = [ + 0xa0..=0xd7ff, + 0xF900..=0xFDCF, + 0xFDF0..=0xFFEF, + 0x10000..=0x1FFFD, + 0x20000..=0x2FFFD, + 0x30000..=0x3FFFD, + 0x40000..=0x4FFFD, + 0x50000..=0x5FFFD, + 0x60000..=0x6FFFD, + 0x70000..=0x7FFFD, + 0x80000..=0x8FFFD, + 0x90000..=0x9FFFD, + 0xA0000..=0xAFFFD, + 0xB0000..=0xBFFFD, + 0xC0000..=0xCFFFD, + 0xD0000..=0xDFFFD, + 0xE1000..=0xEFFFD, +]; + +fn is_ucschar(c: char) -> bool { + is_in_one_of_ranges(c, &ucschar_ranges[..]) } fn is_other_unreserved(c: char) -> bool { @@ -48,27 +70,65 @@ fn is_scheme(c: char) -> bool { } fn ihier_part(input: &str) -> IResult<&str, &str> { - let (input, content) = alt( - tag(""), // ipath-empty - recognize( - tag("//"), - take_while(is_iauthority), - take_while(is_ipath_abempty)), - recognize( - // ipath-absolute - char('/'), - opt( - tuple( - take_while(is_isegment_nz), - many0(recognize(char('/'), take_while(is_isegment)))))), - recognize( - // ipath-rootless + alt( + tag(""), // ipath-empty + tuple( + tag("//"), + take_while(is_iauthority), + take_while(is_ipath_abempty)), + tuple( + // ipath-absolute + char('/'), + opt( tuple( take_while(is_isegment_nz), - many0(recognize(char('/'), take_while(is_isegment))))))(input); - Ok((input, content)) + many0(recognize(char('/'), take_while(is_isegment)))))), + tuple( + // ipath-rootless + take_while(is_isegment_nz), + many0(recognize(char('/'), take_while(is_isegment)))))(input) +} + +fn is_ipchar(c: char) -> bool { + is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c) || matches!(c, ':' | '@') +} + +const IPRIVATE_RANGES: [RangeInclusive; _] = [ + 0xe000..=0xf8ff, + 0xf0000..=0xffffd, + 
0x100000..=0x10fffd, +]; + +fn is_iprivate(c: char) -> bool { + let c = c as u32; + is_in_one_of_ranges(c, &IPRIVATE_RANGES[..]) +} + +fn is_iquery(c: char) -> bool { + is_iprivate(c) || is_ipchar(c) || matches!(c, '/' | '?') +} + +fn iquery(input: &str) -> IResult<&str, &str> { + take_while(is_iquery)(input) +} + +fn is_ifragment(c: char) -> bool { + is_ipchar(c) || matches!(c, '/' | '?') +} + +fn ifragment(input: &str) -> IResult<&str, &str> { + take_while(is_fragment)(input) +} + +fn scheme(input: &str) -> IResult<&str, &str> { + take_while(is_scheme)(input) } fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { - let (input, content): (&str, &str) = recognize( + let (input, scheme) = scheme(input)?; + let (input, (authority, path)) = ihier_part(input)?; + let (input, (_, query)) = opt(tuple(char('?'), take_while(is_query)))(input)?; + let (input, (_, fragment)) = opt(tuple(char('#'), take_while(is_ifragment)))(input)?; + + } diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index 35a9a88..06fbd1e 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -1,4 +1,3 @@ -use crate::parser::link_url::LinkDestination; use crate::parser::parse_from_text::text_elements::email_address; use super::text_elements::{link, parse_text_element}; From 79718c16f8a1d38440f249de856f05a0ce12fcac Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Wed, 17 Jan 2024 12:37:38 +0330 Subject: [PATCH 04/74] implementing IRIs still --- src/parser/parse_from_text/find_range.rs | 25 ++++- src/parser/parse_from_text/link_element.rs | 122 ++++++++++++++++----- 2 files changed, 120 insertions(+), 27 deletions(-) diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs index 0464696..840a081 100644 --- a/src/parser/parse_from_text/find_range.rs +++ b/src/parser/parse_from_text/find_range.rs @@ -6,6 +6,21 @@ enum 
FindRangeResult<'a> { Range(&'a RangeInclusive), } + +/// Find a range which `code` might be in it. +/// +/// # Description +/// This function gets a sorted slice of inclusive u32 ranges, performs +/// binary search on them and returns a FindRangeResult enum telling +/// which range the `code` might be in. It returns `FindRangeResult::WasOnRangeStart` +/// if the code was exactly on start of a range. Or a `FindRangeResult::Range(range)` +/// which indicates `code` is in `range` or in no ranges. +/// +/// # Arguments +/// +/// - `code` the u32 to look for a range for. +/// +/// - `ranges` a refernce to a slice of `RangeInclusive` fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive]) -> FindRangeResult<'a> { let index = HASHTAG_CONTENT_CHAR_RANGES.binary_search_by_key(&code, |range| *range.start()); match index { @@ -21,7 +36,15 @@ fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive]) -> FindRan } } -pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive) -> bool { + +/// Returns true of `c` is one of the `ranges`, false otherwise. +/// +/// # Arguments +/// +/// - `c` A character +/// +/// - `ranges` A sorted slice of ranges to see if `c` is in anyone of them +pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive]) -> bool { let c = c as u32; match find_range_for_char(c, ranges) { FindRangeResult::WasOnRangeStart => true, diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index da882ef..543fd9f 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -9,7 +9,7 @@ use nom::{ }, character, combinator::{peek, recognize, verify}, - sequence::tuple, + sequence::{tuple, preceded}, AsChar, IResult, AsChar::is_dec_digit as is_digit }; @@ -33,6 +33,8 @@ fn is_alpha(c: char) -> bool { } + +// These ranges have been extracted from RFC3987, Page 8. 
const ucschar_ranges: [RangeInclusive, _] = [ 0xa0..=0xd7ff, 0xF900..=0xFDCF, @@ -57,11 +59,25 @@ fn is_ucschar(c: char) -> bool { is_in_one_of_ranges(c, &ucschar_ranges[..]) } +fn is_unreserved(c: char) -> bool { + is_alpha(c) || is_digit(c) || is_other_unreserved(c) +} + +fn is_iunreserved(c: char) -> bool { + is_ucschar(c) || is_unreserved(c) +} + fn is_other_unreserved(c: char) -> bool { - let c = c as u64; - matches!(c, '-' | '_' | '.' | '_' | '~') + matches!(c, '_' | '.' | '_' | '~') } +fn is_pct_encoded(c: [char; 3]) -> bool { + c[0] == '%' && is_hex_digit(c[1]) && is_hex_digit(c[2]) +} + +fn is_sub_delim(c: char) -> bool { + matches!(c, '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=') +} // Here again, order is important. As URLs/IRIs have letters in them // most of the time and less digits or other characters. --Farooq @@ -69,24 +85,65 @@ fn is_scheme(c: char) -> bool { is_alpha(c) || is_digit(c) || is_scheme(c) } -fn ihier_part(input: &str) -> IResult<&str, &str> { - alt( - tag(""), // ipath-empty - tuple( - tag("//"), - take_while(is_iauthority), - take_while(is_ipath_abempty)), - tuple( - // ipath-absolute - char('/'), - opt( - tuple( - take_while(is_isegment_nz), - many0(recognize(char('/'), take_while(is_isegment)))))), - tuple( - // ipath-rootless - take_while(is_isegment_nz), - many0(recognize(char('/'), take_while(is_isegment)))))(input) + +fn is_ipv4(c: char) -> bool { + is_digit(c) || c == '.' 
+} + +fn ipv4(input: &str) -> IResult<&str, &str> { + let (input, possible_ipv4) = take_while_m_n(7, 15, is_ipv4)(input); + // This might be an IPv4 + let inner_pair = separated_pair(take_while1(is_digit), char('.'), take_while1(is_digit)); + let ((part0, part1), (part2, part3)) = separated_pair(inner_pair, char('.'), inner_pair)(input)?; + part0.parse::()?; + part1.parse::()?; + part2.parse::()?; + part3.parse::()?; + Ok((input, possible_ipv4)) +} + +fn is_ireg_name(c: char) -> bool { + is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c) +} + +fn ip_literal(input: &str) -> IResult<&str, &str> { + +} + +/// Parse host +/// +/// # Description +/// +/// Parse host. Returns the rest, the host string and a boolean indicating +/// if it is IPvFuture or IPv6. +fn parse_host(input: &str) -> IResult<&str, &str, bool> { + let (input, host) = ip_literal(input)?; + if host.is_some() { + // It got parsed, then it's an IP Literal meaning + // it's either IPv6 or IPvFuture + Ok((input, host.unwrap(), true)) + } else { + let (input, host) = alt((ipv4, take_while(is_ireg_name)))(input)?; + Ok((input, host, false)) + } +} + +fn iauthority(input: &str) -> IResult<&str, &str, &str, &str, bool> { + let (input, userinfo) = opt(take_while(is_userinfo), char('@'))(input); + let (input, host, is_ipv6) = parse_host(input); + let (input, port) = preceded(char(':'), take_while(is_digit))(input); + Ok((input, userinfo, host, port, is_ipv6)) +} + +fn ihier_part(input: &str) -> IResult<&str, &str, &str> { + let (input, authority) = preceded(tag("//"), iauthoriy)(input); + let (input, path) = alt( + take_while(is_ipath_abempty), + char(''), // ipath-empty + take_while(is_ipath_absolute), + take_while(is_ipath_rootless) + )(input); + Ok((input, authority, path)) } fn is_ipchar(c: char) -> bool { @@ -124,11 +181,24 @@ fn scheme(input: &str) -> IResult<&str, &str> { take_while(is_scheme)(input) } +fn is_alphanum_or_hyphen_minus(char: char) -> bool { + match char { + '-' => true, + _ => 
char.is_alphanum(), + } +} + fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { let (input, scheme) = scheme(input)?; - let (input, (authority, path)) = ihier_part(input)?; - let (input, (_, query)) = opt(tuple(char('?'), take_while(is_query)))(input)?; - let (input, (_, fragment)) = opt(tuple(char('#'), take_while(is_ifragment)))(input)?; - - + let (input, (userinfo, hostport, is_ipv6), path) = ihier_part(input)?; + let (input, query) = opt(preceed(char('?'), take_while(is_query)))(input)?; + let (input, fragment) = opt(preceed(char('#'), take_while(is_ifragment)))(input)?; + Element::Link { + destination: LinkDestination { + target: input, + hostname: Some(hostport), + punycode: None, + scheme: scheme + } + } } From 53979e91c0b0943395f2f25598595d35b931f9fa Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Wed, 17 Jan 2024 12:38:06 +0330 Subject: [PATCH 05/74] pretty pretty code --- src/parser/parse_from_text/hashtag_content_char_ranges.rs | 1 - src/parser/parse_from_text/text_elements.rs | 1 - 2 files changed, 2 deletions(-) diff --git a/src/parser/parse_from_text/hashtag_content_char_ranges.rs b/src/parser/parse_from_text/hashtag_content_char_ranges.rs index f093bd1..9bce303 100644 --- a/src/parser/parse_from_text/hashtag_content_char_ranges.rs +++ b/src/parser/parse_from_text/hashtag_content_char_ranges.rs @@ -867,7 +867,6 @@ const HASHTAG_CONTENT_CHAR_RANGES: [RangeInclusive; NUMBER_OF_RANGES] = [ 0xe0100..=0xe01ef, ]; - pub(crate) fn hashtag_content_char(c: char) -> bool { if matches!(c, '#' | '﹟' | '#' | ' ') { false diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index 476ea31..0c8672f 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -98,7 +98,6 @@ fn not_link_part_char(c: char) -> bool { !matches!(c, ':' | '\n' | '\r' | '\t' | ' ') } - fn link(input: &str) -> IResult<&str, (), CustomError<&str>> { let (input, _) = 
take_while1(link_scheme)(input)?; } From 133294aec39b0c1690991e0c71d4d6174462c3c1 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Thu, 18 Jan 2024 16:30:50 +0330 Subject: [PATCH 06/74] latest update --- src/parser/parse_from_text/link_element.rs | 43 +++++++++++----------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 543fd9f..a28648a 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -7,32 +7,16 @@ use nom::{ bytes::{ complete::{tag, take, take_while1, take_while}, }, - character, + character::{is_digit, is_alphabetic as is_alpha, is_hex_digit}, combinator::{peek, recognize, verify}, sequence::{tuple, preceded}, AsChar, IResult, - AsChar::is_dec_digit as is_digit }; use super::base_parsers::*; // Link syntax here is according to RFC 3986 & 3987 --Farooq -// In these fucntions checking for ranges, order is important. Remember that -// Rust does not check for the second condition in an AND compound boolean -// expression if the first is already false. Therefore, in is_alpha, I've put -// c >= 0x41 before c <= 0x5a as the first has a higher chance of failing. -// nom's own is_alpha is not used as it detects also chars outside the -// ASCII range -// -- Farooq -fn is_alpha(c: char) -> bool { - let c = c as u64; - // basically in inclusive ranges of [0x41, 0x5a] OR - // [0x61, 0x7a] - (c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a &&) -} - - // These ranges have been extracted from RFC3987, Page 8. 
const ucschar_ranges: [RangeInclusive, _] = [ @@ -91,7 +75,15 @@ fn is_ipv4(c: char) -> bool { } fn ipv4(input: &str) -> IResult<&str, &str> { - let (input, possible_ipv4) = take_while_m_n(7, 15, is_ipv4)(input); + let (input, possible_ipv4) = tuple( + complete::u8, + char('.'), + complete::u8, + char('.'), + complete::u8, + char('.'), + complete::u8 + )(input); // This might be an IPv4 let inner_pair = separated_pair(take_while1(is_digit), char('.'), take_while1(is_digit)); let ((part0, part1), (part2, part3)) = separated_pair(inner_pair, char('.'), inner_pair)(input)?; @@ -106,8 +98,17 @@ fn is_ireg_name(c: char) -> bool { is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c) } +fn ipv6(input: &str) -> IResult<&str, &str> { + +} + +fn ipvfuture(input: &str) -> IResult<&str, &str> { + tuple(char('v'), take_while_m_n(1, 1, is_hex_digit), char('.'), take_while_m_n(1, 1 +} + + fn ip_literal(input: &str) -> IResult<&str, &str> { - + delimited(char('['), alt(ipv6, ipvfuture), char(']'))(input) } /// Parse host @@ -190,14 +191,14 @@ fn is_alphanum_or_hyphen_minus(char: char) -> bool { fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { let (input, scheme) = scheme(input)?; - let (input, (userinfo, hostport, is_ipv6), path) = ihier_part(input)?; + let (input, (userinfo, hostport, is_ipvfuture), path) = ihier_part(input)?; let (input, query) = opt(preceed(char('?'), take_while(is_query)))(input)?; let (input, fragment) = opt(preceed(char('#'), take_while(is_ifragment)))(input)?; Element::Link { destination: LinkDestination { target: input, hostname: Some(hostport), - punycode: None, + punycode: , scheme: scheme } } From fbb2652a81d171354c142486607cda8945c2c152 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Mon, 29 Jan 2024 17:33:32 +0330 Subject: [PATCH 07/74] up --- src/parser/parse_from_text/link_element.rs | 36 +++++++++++++++++----- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/src/parser/parse_from_text/link_element.rs 
b/src/parser/parse_from_text/link_element.rs index a28648a..80c1e80 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -2,12 +2,11 @@ use crate::parser::link_url::LinkDestination; use std::ops::RangeInclusive; use super::Element; use crate::nom::{Offset, Slice}; -use nom::character::complete::char; use nom::{ bytes::{ complete::{tag, take, take_while1, take_while}, }, - character::{is_digit, is_alphabetic as is_alpha, is_hex_digit}, + character::{is_digit, is_alphabetic as is_alpha, is_hex_digit, char}, combinator::{peek, recognize, verify}, sequence::{tuple, preceded}, AsChar, IResult, @@ -98,12 +97,35 @@ fn is_ireg_name(c: char) -> bool { is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c) } +fn h16(input: &str) -> IResult<&str, &str> { + take_while_m_n(1, 4, is_hex_digit) +} + +fn ls32(input: &str) -> IResult<&str, &str> { + alt(tuple(h16, char(':'), h16), ipv4) +} + fn ipv6(input: &str) -> IResult<&str, &str> { + let h16_and_period = tuple(h16, char(':')); + let double_period = tag("::"); + tuple( + take_while_m_n(6, 6, h16_and_period), + alt(ls32, double_period), + take_while(5, 5, h16_and_period), + alt(ls32, opt(h16)), + double_period, + take_while(4, 4, h16_and_period), + alt(ls32, opt(tuple(take_while_m_n(0, 1, h16_and_period) + +} + +fn is_ipvfuture_last(c: char) -> bool { + is_unreserved(c) || is_sub_delims(c) || c == ':' } fn ipvfuture(input: &str) -> IResult<&str, &str> { - tuple(char('v'), take_while_m_n(1, 1, is_hex_digit), char('.'), take_while_m_n(1, 1 + tuple(char('v'), take_while_m_n(1, 1, is_hex_digit), char('.'), take_while_m_n(1, 1, is_ipvfuture_last)) } @@ -131,20 +153,20 @@ fn parse_host(input: &str) -> IResult<&str, &str, bool> { fn iauthority(input: &str) -> IResult<&str, &str, &str, &str, bool> { let (input, userinfo) = opt(take_while(is_userinfo), char('@'))(input); - let (input, host, is_ipv6) = parse_host(input); + let (input, host, is_ipvfuture) = parse_host(input); let 
(input, port) = preceded(char(':'), take_while(is_digit))(input); Ok((input, userinfo, host, port, is_ipv6)) } -fn ihier_part(input: &str) -> IResult<&str, &str, &str> { - let (input, authority) = preceded(tag("//"), iauthoriy)(input); +fn ihier_part(input: &str) -> IResult<&str, &str, &str, bool> { + let (input, authority) = preceded(tag("//"), iauthority)(input); let (input, path) = alt( take_while(is_ipath_abempty), char(''), // ipath-empty take_while(is_ipath_absolute), take_while(is_ipath_rootless) )(input); - Ok((input, authority, path)) + Ok((input, authority, path, is_ipvfuture)) } fn is_ipchar(c: char) -> bool { From cf907866527d5a7dabc20c1e25903f1ad08132f6 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sat, 10 Feb 2024 22:13:59 +0330 Subject: [PATCH 08/74] some fixes --- src/parser/parse_from_text/find_range.rs | 6 +- .../hashtag_content_char_ranges.rs | 5 +- src/parser/parse_from_text/link_element.rs | 117 ++++++++++-------- .../parse_from_text/markdown_elements.rs | 4 +- src/parser/parse_from_text/mod.rs | 2 + src/parser/parse_from_text/text_elements.rs | 4 +- 6 files changed, 78 insertions(+), 60 deletions(-) diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs index 840a081..ecaadad 100644 --- a/src/parser/parse_from_text/find_range.rs +++ b/src/parser/parse_from_text/find_range.rs @@ -22,16 +22,16 @@ enum FindRangeResult<'a> { /// /// - `ranges` a refernce to a slice of `RangeInclusive` fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive]) -> FindRangeResult<'a> { - let index = HASHTAG_CONTENT_CHAR_RANGES.binary_search_by_key(&code, |range| *range.start()); + let index = ranges.binary_search_by_key(&code, |range| *range.start()); match index { Ok(_) => FindRangeResult::WasOnRangeStart, Err(index) => match index { - 0 => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[0]), + 0 => FindRangeResult::Range(&ranges[0]), // Since `index` can never be 0, `index - 1` will never overflow. 
Furthermore, the // maximum value which the binary search function returns is `NUMBER_OF_RANGES`. // Therefore, `index - 1` will never panic if we index the array with it. #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)] - index => FindRangeResult::Range(&HASHTAG_CONTENT_CHAR_RANGES[index - 1]), + index => FindRangeResult::Range(&ranges[index - 1]), }, } } diff --git a/src/parser/parse_from_text/hashtag_content_char_ranges.rs b/src/parser/parse_from_text/hashtag_content_char_ranges.rs index 9bce303..a6efa44 100644 --- a/src/parser/parse_from_text/hashtag_content_char_ranges.rs +++ b/src/parser/parse_from_text/hashtag_content_char_ranges.rs @@ -1,3 +1,6 @@ +use std::ops::RangeInclusive; +use crate::parser::parse_from_text::find_range::is_in_one_of_ranges; + const NUMBER_OF_RANGES: usize = 850; /* @@ -873,7 +876,7 @@ pub(crate) fn hashtag_content_char(c: char) -> bool { } else if matches!(c, '+' | '-' | '_') { true } else { - is_in_one_of_ranges(c, &[HASHTAG_CONTENT_CHAR_RANGES]) + is_in_one_of_ranges(c, &HASHTAG_CONTENT_CHAR_RANGES[..]) } } diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 80c1e80..64b47af 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -6,9 +6,10 @@ use nom::{ bytes::{ complete::{tag, take, take_while1, take_while}, }, - character::{is_digit, is_alphabetic as is_alpha, is_hex_digit, char}, + character::{is_alphabetic as is_alpha, char}, combinator::{peek, recognize, verify}, sequence::{tuple, preceded}, + multi::{many_m_n, count}, AsChar, IResult, }; use super::base_parsers::*; @@ -16,9 +17,16 @@ use super::base_parsers::*; // Link syntax here is according to RFC 3986 & 3987 --Farooq +fn is_hex_digit(c: char) -> bool { + c.is_ascii_hexdigit() +} + +fn is_digit(c: char) -> bool { + c.is_digit() +} // These ranges have been extracted from RFC3987, Page 8. 
-const ucschar_ranges: [RangeInclusive, _] = [ +const ucschar_ranges: [RangeInclusive; 17] = [ 0xa0..=0xd7ff, 0xF900..=0xFDCF, 0xFDF0..=0xFFEF, @@ -74,7 +82,7 @@ fn is_ipv4(c: char) -> bool { } fn ipv4(input: &str) -> IResult<&str, &str> { - let (input, possible_ipv4) = tuple( + let (input, ipv4_) = recognize(tuple(( complete::u8, char('.'), complete::u8, @@ -82,15 +90,8 @@ fn ipv4(input: &str) -> IResult<&str, &str> { complete::u8, char('.'), complete::u8 - )(input); - // This might be an IPv4 - let inner_pair = separated_pair(take_while1(is_digit), char('.'), take_while1(is_digit)); - let ((part0, part1), (part2, part3)) = separated_pair(inner_pair, char('.'), inner_pair)(input)?; - part0.parse::()?; - part1.parse::()?; - part2.parse::()?; - part3.parse::()?; - Ok((input, possible_ipv4)) + )))(input)?; + Ok((input, ipv4_)) } fn is_ireg_name(c: char) -> bool { @@ -98,25 +99,32 @@ fn is_ireg_name(c: char) -> bool { } fn h16(input: &str) -> IResult<&str, &str> { - take_while_m_n(1, 4, is_hex_digit) + take_while_m_n(1, 4, is_hex_digit)(input) } fn ls32(input: &str) -> IResult<&str, &str> { - alt(tuple(h16, char(':'), h16), ipv4) + alt((tuple((h16, char(':'), h16)), ipv4))(input) } -fn ipv6(input: &str) -> IResult<&str, &str> { - let h16_and_period = tuple(h16, char(':')); - let double_period = tag("::"); - tuple( - take_while_m_n(6, 6, h16_and_period), - alt(ls32, double_period), - take_while(5, 5, h16_and_period), - alt(ls32, opt(h16)), - double_period, - take_while(4, 4, h16_and_period), - alt(ls32, opt(tuple(take_while_m_n(0, 1, h16_and_period) +fn h16_and_period(input: &str) -> IResult<&str, &str> { + tuple((h16, char(':')))(input) +} + +fn double_period(input: &str) -> IResult<&str, &str> { + tag("::")(input) +} +fn ipv6(input: &str) -> IResult<&str, &str> { + alt(( + recognize(tuple((count(h16_and_period, 6), ls32))), + recognize(tuple((double_period, many_m_n(5, 5, h16_and_period), ls32))), + recognize(tuple((opt(h16), double_period, many_m_n(4, 4, 
h16_and_period), ls32))), + recognize(tuple((opt(tuple((many_m_n(0, 1, h16_and_period), ))), double_period, count(h16_and_period, 3), ls32))), + recognize(tuple((opt(tuple((many_m_n(0, 2, h16_and_period), h16))), double_period, count(h16_and_period, 2), ls32))), + recognize(tuple((opt(tuple((many_m_n(0, 3, h16_and_period), h16))), double_period, count(h16_and_period, 1), ls32))), + recognize(tuple((opt(tuple((many_m_n(0, 4, h16_and_period), h16))), double_period, ls32))), + recognize(tuple((opt(tuple((many_m_n(0, 5, h16_and_period), h16))), double_period, h16))), + recognize(tuple((opt(tuple((many_m_n(0, 6, h16_and_period), h16))), double_period)))))(input) } @@ -125,10 +133,9 @@ fn is_ipvfuture_last(c: char) -> bool { } fn ipvfuture(input: &str) -> IResult<&str, &str> { - tuple(char('v'), take_while_m_n(1, 1, is_hex_digit), char('.'), take_while_m_n(1, 1, is_ipvfuture_last)) + tuple((char('v'), take_while_m_n(1, 1, is_hex_digit), char('.'), take_while_m_n(1, 1, is_ipvfuture_last)))(input) } - fn ip_literal(input: &str) -> IResult<&str, &str> { delimited(char('['), alt(ipv6, ipvfuture), char(']'))(input) } @@ -139,41 +146,42 @@ fn ip_literal(input: &str) -> IResult<&str, &str> { /// /// Parse host. Returns the rest, the host string and a boolean indicating /// if it is IPvFuture or IPv6. -fn parse_host(input: &str) -> IResult<&str, &str, bool> { - let (input, host) = ip_literal(input)?; - if host.is_some() { - // It got parsed, then it's an IP Literal meaning - // it's either IPv6 or IPvFuture - Ok((input, host.unwrap(), true)) - } else { - let (input, host) = alt((ipv4, take_while(is_ireg_name)))(input)?; - Ok((input, host, false)) +fn parse_host(input: &str) -> IResult<&str, (&str, bool)> { + match ip_literal(input) { + Ok((input, host)) => { + // It got parsed, then it's an IP Literal meaning + // it's either IPv6 or IPvFuture + Ok((input, (host, true))) + } + Err(..) 
=> { + let (input, host) = alt((ipv4, take_while(is_ireg_name)))(input)?; + Ok((input, (host, false))) + } } } -fn iauthority(input: &str) -> IResult<&str, &str, &str, &str, bool> { - let (input, userinfo) = opt(take_while(is_userinfo), char('@'))(input); - let (input, host, is_ipvfuture) = parse_host(input); - let (input, port) = preceded(char(':'), take_while(is_digit))(input); - Ok((input, userinfo, host, port, is_ipv6)) +fn iauthority(input: &str) -> IResult<&str, (&str, &str, &str, bool)> { + let (input, userinfo) = opt(take_while(is_userinfo), char('@'))(input)?; + let (input, (host, is_ipv6_or_future)) = parse_host(input)?; + let (input, port) = preceded(char(':'), take_while(is_digit))(input)?; + Ok((input, (userinfo, host, port, is_ipv6_or_future))) } -fn ihier_part(input: &str) -> IResult<&str, &str, &str, bool> { - let (input, authority) = preceded(tag("//"), iauthority)(input); - let (input, path) = alt( +fn ihier_part(input: &str) -> IResult<&str, (&str, &str, &str, &str, bool)> { + let (input, (userinfo, host, port, is_ipv6_or_future)) = preceded(tag("//"), iauthority)(input)?; + let (input, path) = opt(alt( take_while(is_ipath_abempty), - char(''), // ipath-empty take_while(is_ipath_absolute), take_while(is_ipath_rootless) - )(input); - Ok((input, authority, path, is_ipvfuture)) + ))(input)?; + Ok((input, (userinfo, host, port, path, is_ipv6_or_future))) } fn is_ipchar(c: char) -> bool { is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c) || matches!(c, ':' | '@') } -const IPRIVATE_RANGES: [RangeInclusive; _] = [ +const IPRIVATE_RANGES: [RangeInclusive; 3] = [ 0xe000..=0xf8ff, 0xf0000..=0xffffd, 0x100000..=0x10fffd, @@ -211,17 +219,18 @@ fn is_alphanum_or_hyphen_minus(char: char) -> bool { } } -fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { +pub fn link(input: &str) -> IResult<&str, Element> { let (input, scheme) = scheme(input)?; - let (input, (userinfo, hostport, is_ipvfuture), path) = ihier_part(input)?; + let (input, 
(userinfo, host, port, path, is_ipv6_or_future)) = ihier_part(input)?; let (input, query) = opt(preceed(char('?'), take_while(is_query)))(input)?; let (input, fragment) = opt(preceed(char('#'), take_while(is_ifragment)))(input)?; - Element::Link { + let mut s = format!("{scheme}://{userinfo}@{host}:{port}{path}{query}{fragment}"); + Ok((input, Element::Link { destination: LinkDestination { - target: input, + target: &s, hostname: Some(hostport), - punycode: , + punycode: None, scheme: scheme } - } + })) } diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index 06fbd1e..c3cb3ab 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -1,6 +1,8 @@ use crate::parser::parse_from_text::text_elements::email_address; -use super::text_elements::{link, parse_text_element}; +use crate::parser::link_url::LinkDestination; +use super::text_elements::parse_text_element; +use crate::parser::parse_from_text::link_element::link; use super::Element; use super::{base_parsers::*, parse_all}; ///! 
nom parsers for markdown elements diff --git a/src/parser/parse_from_text/mod.rs b/src/parser/parse_from_text/mod.rs index a3180f4..49a7ea7 100644 --- a/src/parser/parse_from_text/mod.rs +++ b/src/parser/parse_from_text/mod.rs @@ -5,6 +5,8 @@ mod desktop_subset; pub mod hashtag_content_char_ranges; mod markdown_elements; mod text_elements; +mod link_element; +mod find_range; /// parses text elements such as links and email addresses, excluding markdown pub(crate) fn parse_only_text(input: &str) -> std::vec::Vec { diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index 0c8672f..6930fb4 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -2,6 +2,7 @@ use crate::parser::link_url::LinkDestination; use super::base_parsers::*; +use super::link_element::link; use super::hashtag_content_char_ranges::hashtag_content_char; use super::Element; use crate::nom::{Offset, Slice}; @@ -98,6 +99,7 @@ fn not_link_part_char(c: char) -> bool { !matches!(c, ':' | '\n' | '\r' | '\t' | ' ') } +/* fn link(input: &str) -> IResult<&str, (), CustomError<&str>> { let (input, _) = take_while1(link_scheme)(input)?; } @@ -229,7 +231,7 @@ pub(crate) fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { Err(nom::Err::Error(CustomError::InvalidLink)) } } - +*/ fn is_allowed_bot_cmd_suggestion_char(char: char) -> bool { match char { '@' | '\\' | '_' | '/' | '.' 
| '-' => true, From ed94a504a6669f398364f24564ef891e75e8d27c Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 11 Feb 2024 15:54:02 +0330 Subject: [PATCH 09/74] fixed many issues --- src/parser/parse_from_text/base_parsers.rs | 2 +- src/parser/parse_from_text/desktop_subset.rs | 3 + src/parser/parse_from_text/find_range.rs | 10 +- .../hashtag_content_char_ranges.rs | 2 +- src/parser/parse_from_text/link_element.rs | 171 +++++++++++------- .../parse_from_text/markdown_elements.rs | 7 +- src/parser/parse_from_text/mod.rs | 4 +- src/parser/parse_from_text/text_elements.rs | 3 +- 8 files changed, 127 insertions(+), 75 deletions(-) diff --git a/src/parser/parse_from_text/base_parsers.rs b/src/parser/parse_from_text/base_parsers.rs index 9881d36..cc88ea0 100644 --- a/src/parser/parse_from_text/base_parsers.rs +++ b/src/parser/parse_from_text/base_parsers.rs @@ -3,7 +3,7 @@ use std::fmt::Debug; ///! Base utility parsers, used by both text and markdown parsers use nom::{ bytes::complete::tag, - error::{ErrorKind, ParseError}, + error::{Error, ErrorKind, ParseError}, sequence::delimited, IResult, }; diff --git a/src/parser/parse_from_text/desktop_subset.rs b/src/parser/parse_from_text/desktop_subset.rs index 14fbab4..438f0db 100644 --- a/src/parser/parse_from_text/desktop_subset.rs +++ b/src/parser/parse_from_text/desktop_subset.rs @@ -2,6 +2,9 @@ //! 
and also we can keep delimited and labled links in desktop use super::base_parsers::*; +use super::base_parsers::{ + direct_delimited, is_white_space, is_white_space_but_not_linebreak, CustomError, +}; use super::markdown_elements::{delimited_email_address, delimited_link, labeled_link}; use super::text_elements::parse_text_element; use super::Element; diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs index ecaadad..77f8ba1 100644 --- a/src/parser/parse_from_text/find_range.rs +++ b/src/parser/parse_from_text/find_range.rs @@ -6,20 +6,19 @@ enum FindRangeResult<'a> { Range(&'a RangeInclusive), } - /// Find a range which `code` might be in it. -/// -/// # Description +/// +/// # Description /// This function gets a sorted slice of inclusive u32 ranges, performs /// binary search on them and returns a FindRangeResult enum telling /// which range the `code` might be in. It returns `FindRangeResult::WasOnRangeStart` /// if the code was exactly on start of a range. Or a `FindRangeResult::Range(range)` /// which indicates `code` is in `range` or in no ranges. -/// +/// /// # Arguments /// /// - `code` the u32 to look for a range for. -/// +/// /// - `ranges` a refernce to a slice of `RangeInclusive` fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive]) -> FindRangeResult<'a> { let index = ranges.binary_search_by_key(&code, |range| *range.start()); @@ -36,7 +35,6 @@ fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive]) -> FindRan } } - /// Returns true of `c` is one of the `ranges`, false otherwise. 
/// /// # Arguments diff --git a/src/parser/parse_from_text/hashtag_content_char_ranges.rs b/src/parser/parse_from_text/hashtag_content_char_ranges.rs index a6efa44..1c934fe 100644 --- a/src/parser/parse_from_text/hashtag_content_char_ranges.rs +++ b/src/parser/parse_from_text/hashtag_content_char_ranges.rs @@ -1,5 +1,5 @@ -use std::ops::RangeInclusive; use crate::parser::parse_from_text::find_range::is_in_one_of_ranges; +use std::ops::RangeInclusive; const NUMBER_OF_RANGES: usize = 850; diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 64b47af..17b934b 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -1,22 +1,24 @@ -use crate::parser::link_url::LinkDestination; -use std::ops::RangeInclusive; +use super::base_parsers::*; +use super::find_range::is_in_one_of_ranges; use super::Element; use crate::nom::{Offset, Slice}; +use crate::parser::link_url::LinkDestination; use nom::{ - bytes::{ - complete::{tag, take, take_while1, take_while}, + branch::alt, + bytes::complete::{tag, take, take_while, take_while1, take_while_m_n}, + character::{ + is_alphabetic as is_alpha, + complete::{char, u8} }, - character::{is_alphabetic as is_alpha, char}, - combinator::{peek, recognize, verify}, - sequence::{tuple, preceded}, - multi::{many_m_n, count}, + combinator::{opt, peek, recognize, verify}, + multi::{count, many0, many_m_n}, + sequence::{delimited, preceded, tuple}, AsChar, IResult, }; -use super::base_parsers::*; +use std::ops::RangeInclusive; // Link syntax here is according to RFC 3986 & 3987 --Farooq - fn is_hex_digit(c: char) -> bool { c.is_ascii_hexdigit() } @@ -26,7 +28,7 @@ fn is_digit(c: char) -> bool { } // These ranges have been extracted from RFC3987, Page 8. 
-const ucschar_ranges: [RangeInclusive; 17] = [ +const UCSCHAR_RANGES: [RangeInclusive; 17] = [ 0xa0..=0xd7ff, 0xF900..=0xFDCF, 0xFDF0..=0xFFEF, @@ -47,7 +49,7 @@ const ucschar_ranges: [RangeInclusive; 17] = [ ]; fn is_ucschar(c: char) -> bool { - is_in_one_of_ranges(c, &ucschar_ranges[..]) + is_in_one_of_ranges(c, &UCSCHAR_RANGES[..]) } fn is_unreserved(c: char) -> bool { @@ -67,7 +69,10 @@ fn is_pct_encoded(c: [char; 3]) -> bool { } fn is_sub_delim(c: char) -> bool { - matches!(c, '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=') + matches!( + c, + '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' + ) } // Here again, order is important. As URLs/IRIs have letters in them @@ -76,26 +81,18 @@ fn is_scheme(c: char) -> bool { is_alpha(c) || is_digit(c) || is_scheme(c) } - fn is_ipv4(c: char) -> bool { is_digit(c) || c == '.' } fn ipv4(input: &str) -> IResult<&str, &str> { - let (input, ipv4_) = recognize(tuple(( - complete::u8, - char('.'), - complete::u8, - char('.'), - complete::u8, - char('.'), - complete::u8 - )))(input)?; + let (input, ipv4_) = + recognize(tuple((u8, char('.'), u8, char('.'), u8, char('.'), u8)))(input)?; Ok((input, ipv4_)) } fn is_ireg_name(c: char) -> bool { - is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c) + is_iunreserved(c) || is_pct_encoded(c) || is_sub_delim(c) } fn h16(input: &str) -> IResult<&str, &str> { @@ -118,32 +115,68 @@ fn ipv6(input: &str) -> IResult<&str, &str> { alt(( recognize(tuple((count(h16_and_period, 6), ls32))), recognize(tuple((double_period, many_m_n(5, 5, h16_and_period), ls32))), - recognize(tuple((opt(h16), double_period, many_m_n(4, 4, h16_and_period), ls32))), - recognize(tuple((opt(tuple((many_m_n(0, 1, h16_and_period), ))), double_period, count(h16_and_period, 3), ls32))), - recognize(tuple((opt(tuple((many_m_n(0, 2, h16_and_period), h16))), double_period, count(h16_and_period, 2), ls32))), - recognize(tuple((opt(tuple((many_m_n(0, 3, h16_and_period), h16))), 
double_period, count(h16_and_period, 1), ls32))), - recognize(tuple((opt(tuple((many_m_n(0, 4, h16_and_period), h16))), double_period, ls32))), - recognize(tuple((opt(tuple((many_m_n(0, 5, h16_and_period), h16))), double_period, h16))), - recognize(tuple((opt(tuple((many_m_n(0, 6, h16_and_period), h16))), double_period)))))(input) + recognize(tuple(( + opt(h16), + double_period, + many_m_n(4, 4, h16_and_period), + ls32, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 1, h16_and_period),))), + double_period, + count(h16_and_period, 3), + ls32, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 2, h16_and_period), h16))), + double_period, + count(h16_and_period, 2), + ls32, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 3, h16_and_period), h16))), + double_period, + count(h16_and_period, 1), + ls32, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 4, h16_and_period), h16))), + double_period, + ls32, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 5, h16_and_period), h16))), + double_period, + h16, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 6, h16_and_period), h16))), + double_period, + ))), + ))(input) } - fn is_ipvfuture_last(c: char) -> bool { - is_unreserved(c) || is_sub_delims(c) || c == ':' + is_unreserved(c) || is_sub_delim(c) || c == ':' } fn ipvfuture(input: &str) -> IResult<&str, &str> { - tuple((char('v'), take_while_m_n(1, 1, is_hex_digit), char('.'), take_while_m_n(1, 1, is_ipvfuture_last)))(input) + tuple(( + char('v'), + take_while_m_n(1, 1, is_hex_digit), + char('.'), + take_while_m_n(1, 1, is_ipvfuture_last), + ))(input) } fn ip_literal(input: &str) -> IResult<&str, &str> { - delimited(char('['), alt(ipv6, ipvfuture), char(']'))(input) + delimited(char('['), alt((ipv6, ipvfuture)), char(']'))(input) } /// Parse host /// /// # Description -/// +/// /// Parse host. Returns the rest, the host string and a boolean indicating /// if it is IPvFuture or IPv6. 
fn parse_host(input: &str) -> IResult<&str, (&str, bool)> { @@ -160,35 +193,46 @@ fn parse_host(input: &str) -> IResult<&str, (&str, bool)> { } } +fn is_userinfo(c: char) -> bool { + is_iunreserved(c) || is_pct_encoded(c) || is_sub_delim(c) +} + fn iauthority(input: &str) -> IResult<&str, (&str, &str, &str, bool)> { - let (input, userinfo) = opt(take_while(is_userinfo), char('@'))(input)?; + let (input, userinfo) = opt(tuple((take_while(is_userinfo), char('@'))))(input)?; let (input, (host, is_ipv6_or_future)) = parse_host(input)?; let (input, port) = preceded(char(':'), take_while(is_digit))(input)?; + let userinfo = userinfo.unwrap_or(""); Ok((input, (userinfo, host, port, is_ipv6_or_future))) } fn ihier_part(input: &str) -> IResult<&str, (&str, &str, &str, &str, bool)> { - let (input, (userinfo, host, port, is_ipv6_or_future)) = preceded(tag("//"), iauthority)(input)?; - let (input, path) = opt(alt( - take_while(is_ipath_abempty), - take_while(is_ipath_absolute), - take_while(is_ipath_rootless) - ))(input)?; + let (input, (userinfo, host, port, is_ipv6_or_future)) = + preceded(tag("//"), iauthority)(input)?; + let (input, path) = opt(alt(( + recognize(tuple(( + char('/'), + opt(tuple(( + take_while1(is_ipchar), + many0(tuple((char('/'), take_while(is_ipchar)))), + ))), + ))), // ipath-absolute + recognize(tuple(( + take_while1(is_ipchar), + many0(tuple((char('/'), take_while(is_ipchar)))), + ))), // ipath_rootless + )))(input)?; + let path = path.unwrap_or(""); // it's ipath_empty Ok((input, (userinfo, host, port, path, is_ipv6_or_future))) } fn is_ipchar(c: char) -> bool { - is_iunreserved(c) || is_pct_encoded(c) || is_sub_delims(c) || matches!(c, ':' | '@') + is_iunreserved(c) || is_pct_encoded(c) || is_sub_delim(c) || matches!(c, ':' | '@') } -const IPRIVATE_RANGES: [RangeInclusive; 3] = [ - 0xe000..=0xf8ff, - 0xf0000..=0xffffd, - 0x100000..=0x10fffd, -]; +const IPRIVATE_RANGES: [RangeInclusive; 3] = + [0xe000..=0xf8ff, 0xf0000..=0xffffd, 
0x100000..=0x10fffd]; fn is_iprivate(c: char) -> bool { - let c = c as u32; is_in_one_of_ranges(c, &IPRIVATE_RANGES[..]) } @@ -205,7 +249,7 @@ fn is_ifragment(c: char) -> bool { } fn ifragment(input: &str) -> IResult<&str, &str> { - take_while(is_fragment)(input) + take_while(is_ifragment)(input) } fn scheme(input: &str) -> IResult<&str, &str> { @@ -222,15 +266,18 @@ fn is_alphanum_or_hyphen_minus(char: char) -> bool { pub fn link(input: &str) -> IResult<&str, Element> { let (input, scheme) = scheme(input)?; let (input, (userinfo, host, port, path, is_ipv6_or_future)) = ihier_part(input)?; - let (input, query) = opt(preceed(char('?'), take_while(is_query)))(input)?; - let (input, fragment) = opt(preceed(char('#'), take_while(is_ifragment)))(input)?; - let mut s = format!("{scheme}://{userinfo}@{host}:{port}{path}{query}{fragment}"); - Ok((input, Element::Link { - destination: LinkDestination { - target: &s, - hostname: Some(hostport), - punycode: None, - scheme: scheme - } - })) + let (input, Some(query)) = opt(preceded(char('?'), take_while(is_iquery)))(input)?; + let (input, Some(fragment)) = opt(preceded(char('#'), take_while(is_ifragment)))(input)?; + let mut s = format!("{scheme}://{userinfo}@{host}:{port}{path}?{query}#{fragment}"); + Ok(( + input, + Element::Link { + destination: LinkDestination { + target: &s, + hostname: Some(host), + punycode: None, + scheme: scheme, + }, + }, + )) } diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index c3cb3ab..b5dd8f8 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -1,10 +1,13 @@ use crate::parser::parse_from_text::text_elements::email_address; -use crate::parser::link_url::LinkDestination; +use super::base_parsers::{ + direct_delimited, is_white_space, is_white_space_but_not_linebreak, CustomError, +}; use super::text_elements::parse_text_element; -use 
crate::parser::parse_from_text::link_element::link; use super::Element; use super::{base_parsers::*, parse_all}; +use crate::parser::link_url::LinkDestination; +use crate::parser::parse_from_text::link_element::link; ///! nom parsers for markdown elements use nom::{ bytes::complete::{is_not, tag, take, take_while}, diff --git a/src/parser/parse_from_text/mod.rs b/src/parser/parse_from_text/mod.rs index 49a7ea7..ed5e7fd 100644 --- a/src/parser/parse_from_text/mod.rs +++ b/src/parser/parse_from_text/mod.rs @@ -2,11 +2,11 @@ use super::Element; pub(crate) mod base_parsers; mod desktop_subset; +mod find_range; pub mod hashtag_content_char_ranges; +mod link_element; mod markdown_elements; mod text_elements; -mod link_element; -mod find_range; /// parses text elements such as links and email addresses, excluding markdown pub(crate) fn parse_only_text(input: &str) -> std::vec::Vec { diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index 6930fb4..a04d040 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -1,9 +1,10 @@ ///! 
nom parsers for text elements use crate::parser::link_url::LinkDestination; +use super::base_parsers::CustomError; use super::base_parsers::*; -use super::link_element::link; use super::hashtag_content_char_ranges::hashtag_content_char; +use super::link_element::link; use super::Element; use crate::nom::{Offset, Slice}; use nom::bytes::complete::take_while; From 87993c233584c5ad40be3b71a43768044985e1b1 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 11 Feb 2024 16:09:54 +0330 Subject: [PATCH 10/74] some more fixes --- src/parser/parse_from_text/link_element.rs | 22 +++++++++---------- .../parse_from_text/markdown_elements.rs | 7 +++++- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 17b934b..5b5f4e6 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -64,10 +64,6 @@ fn is_other_unreserved(c: char) -> bool { matches!(c, '_' | '.' 
| '_' | '~') } -fn is_pct_encoded(c: [char; 3]) -> bool { - c[0] == '%' && is_hex_digit(c[1]) && is_hex_digit(c[2]) -} - fn is_sub_delim(c: char) -> bool { matches!( c, @@ -198,7 +194,7 @@ fn is_userinfo(c: char) -> bool { } fn iauthority(input: &str) -> IResult<&str, (&str, &str, &str, bool)> { - let (input, userinfo) = opt(tuple((take_while(is_userinfo), char('@'))))(input)?; + let (input, userinfo) = opt(recognize(tuple((take_while(is_userinfo), char('@')))))(input)?; let (input, (host, is_ipv6_or_future)) = parse_host(input)?; let (input, port) = preceded(char(':'), take_while(is_digit))(input)?; let userinfo = userinfo.unwrap_or(""); @@ -225,8 +221,8 @@ fn ihier_part(input: &str) -> IResult<&str, (&str, &str, &str, &str, bool)> { Ok((input, (userinfo, host, port, path, is_ipv6_or_future))) } -fn is_ipchar(c: char) -> bool { - is_iunreserved(c) || is_pct_encoded(c) || is_sub_delim(c) || matches!(c, ':' | '@') +fn is_ipchar_not_pct_encoded(c: char) -> bool { + is_iunreserved(c) || is_sub_delim(c) || matches!(c, ':' | '@') } const IPRIVATE_RANGES: [RangeInclusive; 3] = @@ -236,12 +232,12 @@ fn is_iprivate(c: char) -> bool { is_in_one_of_ranges(c, &IPRIVATE_RANGES[..]) } -fn is_iquery(c: char) -> bool { - is_iprivate(c) || is_ipchar(c) || matches!(c, '/' | '?') +fn is_iquery_not_pct_encoded(c: char) -> bool { + is_iprivate(c) || is_ipchar_not_pct_encoded(c) || matches!(c, '/' | '?') } fn iquery(input: &str) -> IResult<&str, &str> { - take_while(is_iquery)(input) + recognize(many0(alt((take_while(is_iquery_not_pct_encoded), pct_encoded))))(input) } fn is_ifragment(c: char) -> bool { @@ -263,10 +259,14 @@ fn is_alphanum_or_hyphen_minus(char: char) -> bool { } } +fn pct_encoded(input: &str) -> IResult<&str, &str> { + recognize(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit))))(input) +} + pub fn link(input: &str) -> IResult<&str, Element> { let (input, scheme) = scheme(input)?; let (input, (userinfo, host, port, path, is_ipv6_or_future)) = ihier_part(input)?; - 
let (input, Some(query)) = opt(preceded(char('?'), take_while(is_iquery)))(input)?; + let (input, Some(query)) = opt(preceded(char('?'), iquery))(input)?; let (input, Some(fragment)) = opt(preceded(char('#'), take_while(is_ifragment)))(input)?; let mut s = format!("{scheme}://{userinfo}@{host}:{port}{path}?{query}#{fragment}"); Ok(( diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index b5dd8f8..1726e63 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -98,7 +98,12 @@ pub(crate) fn delimited_link(input: &str) -> IResult<&str, Element, CustomError< if content.is_empty() { return Err(nom::Err::Error(CustomError::NoContent)); } - let (rest, link) = link(content)?; + let (rest, link) = match link(content) { + Ok((rest, link)) => (rest, link), + Err(nom::Err(err)) => { + return Err(Error(CustomError::Nom(err.input, err.code))); + } + } if !rest.is_empty() { return Err(nom::Err::Error(CustomError::UnexpectedContent)); } From b3a0f71b8c7d47f7fa1a456862eb949a2576c5d5 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sat, 17 Feb 2024 15:02:14 +0330 Subject: [PATCH 11/74] some fixes --- src/parser/parse_from_text/link_element.rs | 34 +++++++++++++------ .../parse_from_text/markdown_elements.rs | 4 +-- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 5b5f4e6..181ef8a 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -6,10 +6,7 @@ use crate::parser::link_url::LinkDestination; use nom::{ branch::alt, bytes::complete::{tag, take, take_while, take_while1, take_while_m_n}, - character::{ - is_alphabetic as is_alpha, - complete::{char, u8} - }, + character::complete::{char, u8}, combinator::{opt, peek, recognize, verify}, multi::{count, many0, many_m_n}, sequence::{delimited, 
preceded, tuple}, @@ -19,12 +16,16 @@ use std::ops::RangeInclusive; // Link syntax here is according to RFC 3986 & 3987 --Farooq +fn is_alpha(c: char) -> bool{ + c.is_alphabetic() +} + fn is_hex_digit(c: char) -> bool { c.is_ascii_hexdigit() } fn is_digit(c: char) -> bool { - c.is_digit() + c.is_digit(10) } // These ranges have been extracted from RFC3987, Page 8. @@ -96,11 +97,16 @@ fn h16(input: &str) -> IResult<&str, &str> { } fn ls32(input: &str) -> IResult<&str, &str> { - alt((tuple((h16, char(':'), h16)), ipv4))(input) + let result = recognize(tuple((h16, char(':'), h16)))(input); + if result.is_err() { + ipv4(input) + } else { + result + } } fn h16_and_period(input: &str) -> IResult<&str, &str> { - tuple((h16, char(':')))(input) + recognize(tuple((h16, char(':'))))(input) } fn double_period(input: &str) -> IResult<&str, &str> { @@ -157,12 +163,12 @@ fn is_ipvfuture_last(c: char) -> bool { } fn ipvfuture(input: &str) -> IResult<&str, &str> { - tuple(( + recognize(tuple(( char('v'), take_while_m_n(1, 1, is_hex_digit), char('.'), take_while_m_n(1, 1, is_ipvfuture_last), - ))(input) + )))(input) } fn ip_literal(input: &str) -> IResult<&str, &str> { @@ -213,7 +219,7 @@ fn ihier_part(input: &str) -> IResult<&str, (&str, &str, &str, &str, bool)> { ))), ))), // ipath-absolute recognize(tuple(( - take_while1(is_ipchar), + take_while_ipchar, many0(tuple((char('/'), take_while(is_ipchar)))), ))), // ipath_rootless )))(input)?; @@ -225,6 +231,14 @@ fn is_ipchar_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) || matches!(c, ':' | '@') } +fn take_while_ipchar(input: &str) -> IResult<&str, &str> { + many0(alt((take_while(is_ipchar_not_pct_encoded), take_while(is_pct_encoded)))(input) +} + +fn is_pct_encoded(c: [char; 3]) -> bool { + c[0] == '%' && is_hex_digit(c[1]) && is_hex_digit(c[2]) +} + const IPRIVATE_RANGES: [RangeInclusive; 3] = [0xe000..=0xf8ff, 0xf0000..=0xffffd, 0x100000..=0x10fffd]; diff --git 
a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index 1726e63..2b93e0f 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -100,8 +100,8 @@ pub(crate) fn delimited_link(input: &str) -> IResult<&str, Element, CustomError< } let (rest, link) = match link(content) { Ok((rest, link)) => (rest, link), - Err(nom::Err(err)) => { - return Err(Error(CustomError::Nom(err.input, err.code))); + Err(nom::Err::Error(err)) => { + return Err(nom::Err::Error(CustomError::Nom(err.input, err.code))); } } if !rest.is_empty() { From f8ba2fe78886f250e40ea8f0f24cf5aa7d11a29d Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sat, 17 Feb 2024 15:53:52 +0330 Subject: [PATCH 12/74] some more fixes --- src/parser/parse_from_text/link_element.rs | 24 +++++++++------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 181ef8a..b4d6ede 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -88,8 +88,8 @@ fn ipv4(input: &str) -> IResult<&str, &str> { Ok((input, ipv4_)) } -fn is_ireg_name(c: char) -> bool { - is_iunreserved(c) || is_pct_encoded(c) || is_sub_delim(c) +fn is_ireg_name_not_pct_encoded(c: char) -> bool { + is_iunreserved(c) || is_sub_delim(c) } fn h16(input: &str) -> IResult<&str, &str> { @@ -189,18 +189,18 @@ fn parse_host(input: &str) -> IResult<&str, (&str, bool)> { Ok((input, (host, true))) } Err(..) 
=> { - let (input, host) = alt((ipv4, take_while(is_ireg_name)))(input)?; + let (input, host) = alt((ipv4, many0(alt((take_while(is_ireg_name_not_pct_encoded)))))(input)?; Ok((input, (host, false))) } } } -fn is_userinfo(c: char) -> bool { - is_iunreserved(c) || is_pct_encoded(c) || is_sub_delim(c) +fn is_userinfo_not_pct_encoded(c: char) -> bool { + is_iunreserved(c) || is_sub_delim(c) } fn iauthority(input: &str) -> IResult<&str, (&str, &str, &str, bool)> { - let (input, userinfo) = opt(recognize(tuple((take_while(is_userinfo), char('@')))))(input)?; + let (input, userinfo) = opt(recognize(alt((take_while_pct_encoded, tuple((take_while(is_userinfo), char('@')))))))(input)?; let (input, (host, is_ipv6_or_future)) = parse_host(input)?; let (input, port) = preceded(char(':'), take_while(is_digit))(input)?; let userinfo = userinfo.unwrap_or(""); @@ -215,12 +215,12 @@ fn ihier_part(input: &str) -> IResult<&str, (&str, &str, &str, &str, bool)> { char('/'), opt(tuple(( take_while1(is_ipchar), - many0(tuple((char('/'), take_while(is_ipchar)))), + many0(tuple((char('/'), take_while_ipchar))), ))), ))), // ipath-absolute recognize(tuple(( take_while_ipchar, - many0(tuple((char('/'), take_while(is_ipchar)))), + many0(tuple((char('/'), take_while_ipchar))), ))), // ipath_rootless )))(input)?; let path = path.unwrap_or(""); // it's ipath_empty @@ -232,11 +232,7 @@ fn is_ipchar_not_pct_encoded(c: char) -> bool { } fn take_while_ipchar(input: &str) -> IResult<&str, &str> { - many0(alt((take_while(is_ipchar_not_pct_encoded), take_while(is_pct_encoded)))(input) -} - -fn is_pct_encoded(c: [char; 3]) -> bool { - c[0] == '%' && is_hex_digit(c[1]) && is_hex_digit(c[2]) + recognize(many0(alt((take_while(is_ipchar_not_pct_encoded), take_while_pct_encoded))))(input) } const IPRIVATE_RANGES: [RangeInclusive; 3] = @@ -273,7 +269,7 @@ fn is_alphanum_or_hyphen_minus(char: char) -> bool { } } -fn pct_encoded(input: &str) -> IResult<&str, &str> { +fn take_while_pct_encoded(input: &str) -> 
IResult<&str, &str> { recognize(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit))))(input) } From 8c225a1d38f92cd54ecd163f7df146dfbbe10cbd Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Wed, 21 Feb 2024 19:11:06 +0330 Subject: [PATCH 13/74] many other fixes --- src/parser/parse_from_text/link_element.rs | 36 ++++++++++++++-------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index b4d6ede..4b8c1b2 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -8,7 +8,7 @@ use nom::{ bytes::complete::{tag, take, take_while, take_while1, take_while_m_n}, character::complete::{char, u8}, combinator::{opt, peek, recognize, verify}, - multi::{count, many0, many_m_n}, + multi::{count, many0, many1, many_m_n}, sequence::{delimited, preceded, tuple}, AsChar, IResult, }; @@ -189,24 +189,36 @@ fn parse_host(input: &str) -> IResult<&str, (&str, bool)> { Ok((input, (host, true))) } Err(..) 
=> { - let (input, host) = alt((ipv4, many0(alt((take_while(is_ireg_name_not_pct_encoded)))))(input)?; + let (input, host) = alt((ipv4, take_while_ireg))(input)?; Ok((input, (host, false))) } } } +fn take_while_ireg(input: &str) -> IResult<&str, &str> { + alt((recognize(many0(take_while_pct_encoded)), take_while(is_ireg_name_not_pct_encoded)))(input) +} + fn is_userinfo_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) } fn iauthority(input: &str) -> IResult<&str, (&str, &str, &str, bool)> { - let (input, userinfo) = opt(recognize(alt((take_while_pct_encoded, tuple((take_while(is_userinfo), char('@')))))))(input)?; + let (input, userinfo) = opt(recognize(tuple((take_while_iuserinfo, char('@')))))(input)?; let (input, (host, is_ipv6_or_future)) = parse_host(input)?; let (input, port) = preceded(char(':'), take_while(is_digit))(input)?; let userinfo = userinfo.unwrap_or(""); Ok((input, (userinfo, host, port, is_ipv6_or_future))) } +fn take_while_iuserinfo(input: &str) -> IResult<&str, &str> { + alt((recognize(many0(take_while_pct_encoded)), take_while(is_iuserinfo_not_pct_encoded)))(input) +} + +fn is_iuserinfo_not_pct_encoded(c: char) -> bool { + is_iunreserved(c) || is_sub_delim(c) || c == ':' +} + fn ihier_part(input: &str) -> IResult<&str, (&str, &str, &str, &str, bool)> { let (input, (userinfo, host, port, is_ipv6_or_future)) = preceded(tag("//"), iauthority)(input)?; @@ -214,7 +226,7 @@ fn ihier_part(input: &str) -> IResult<&str, (&str, &str, &str, &str, bool)> { recognize(tuple(( char('/'), opt(tuple(( - take_while1(is_ipchar), + take_while_ipchar1, many0(tuple((char('/'), take_while_ipchar))), ))), ))), // ipath-absolute @@ -235,6 +247,10 @@ fn take_while_ipchar(input: &str) -> IResult<&str, &str> { recognize(many0(alt((take_while(is_ipchar_not_pct_encoded), take_while_pct_encoded))))(input) } +fn take_while_ipchar1(input: &str) -> IResult<&str, &str> { + recognize(many1(alt((take_while(is_ipchar_not_pct_encoded), 
take_while_pct_encoded))))(input) +} + const IPRIVATE_RANGES: [RangeInclusive; 3] = [0xe000..=0xf8ff, 0xf0000..=0xffffd, 0x100000..=0x10fffd]; @@ -247,15 +263,11 @@ fn is_iquery_not_pct_encoded(c: char) -> bool { } fn iquery(input: &str) -> IResult<&str, &str> { - recognize(many0(alt((take_while(is_iquery_not_pct_encoded), pct_encoded))))(input) -} - -fn is_ifragment(c: char) -> bool { - is_ipchar(c) || matches!(c, '/' | '?') + recognize(many0(alt((take_while(is_iquery_not_pct_encoded), take_while_pct_encoded))))(input) } -fn ifragment(input: &str) -> IResult<&str, &str> { - take_while(is_ifragment)(input) +fn take_while_ifragment(input: &str) -> IResult<&str, &str> { + recognize(many0(alt((take_while_ipchar, take_while_pct_encoded, tag("/"), tag("?")))))(input) } fn scheme(input: &str) -> IResult<&str, &str> { @@ -277,7 +289,7 @@ pub fn link(input: &str) -> IResult<&str, Element> { let (input, scheme) = scheme(input)?; let (input, (userinfo, host, port, path, is_ipv6_or_future)) = ihier_part(input)?; let (input, Some(query)) = opt(preceded(char('?'), iquery))(input)?; - let (input, Some(fragment)) = opt(preceded(char('#'), take_while(is_ifragment)))(input)?; + let (input, Some(fragment)) = opt(preceded(char('#'), take_while_ifragment))(input)?; let mut s = format!("{scheme}://{userinfo}@{host}:{port}{path}?{query}#{fragment}"); Ok(( input, From d492bf0d5a81e1c673c81f4d8be5153cbec9ed26 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Wed, 21 Feb 2024 19:12:54 +0330 Subject: [PATCH 14/74] another fix --- src/parser/parse_from_text/link_element.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 4b8c1b2..5ebc9bc 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -286,16 +286,17 @@ fn take_while_pct_encoded(input: &str) -> IResult<&str, &str> { } pub fn link(input: &str) -> 
IResult<&str, Element> { - let (input, scheme) = scheme(input)?; - let (input, (userinfo, host, port, path, is_ipv6_or_future)) = ihier_part(input)?; - let (input, Some(query)) = opt(preceded(char('?'), iquery))(input)?; - let (input, Some(fragment)) = opt(preceded(char('#'), take_while_ifragment))(input)?; + let input_ = <&str>::clone(input); + let (input_, scheme) = scheme(input)?; + let (input_, (userinfo, host, port, path, is_ipv6_or_future)) = ihier_part(input)?; + let (input_, Some(query)) = opt(preceded(char('?'), iquery))(input)?; + let (input_, Some(fragment)) = opt(preceded(char('#'), take_while_ifragment))(input)?; let mut s = format!("{scheme}://{userinfo}@{host}:{port}{path}?{query}#{fragment}"); Ok(( - input, + input[0..s.len()], Element::Link { destination: LinkDestination { - target: &s, + target: , hostname: Some(host), punycode: None, scheme: scheme, From 28cbdf7808c7186cfa2a016935c5d5c7182c2033 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Thu, 22 Feb 2024 18:17:23 +0330 Subject: [PATCH 15/74] taking slices from input --- src/parser/parse_from_text/link_element.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 5ebc9bc..4905ed1 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -286,17 +286,17 @@ fn take_while_pct_encoded(input: &str) -> IResult<&str, &str> { } pub fn link(input: &str) -> IResult<&str, Element> { - let input_ = <&str>::clone(input); + let input_ = <&str>::clone(&input); let (input_, scheme) = scheme(input)?; let (input_, (userinfo, host, port, path, is_ipv6_or_future)) = ihier_part(input)?; let (input_, Some(query)) = opt(preceded(char('?'), iquery))(input)?; let (input_, Some(fragment)) = opt(preceded(char('#'), take_while_ifragment))(input)?; let mut s = format!("{scheme}://{userinfo}@{host}:{port}{path}?{query}#{fragment}"); Ok(( - 
input[0..s.len()], + &input[0..s.len()], Element::Link { destination: LinkDestination { - target: , + target: &input[(scheme.len() + 3)..(userinfo.len() + 2 + host.len() + port.len())], hostname: Some(host), punycode: None, scheme: scheme, From ee0611d226a79b75e246e6fd4851a04fa40bea65 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 5 Mar 2024 15:28:17 +0330 Subject: [PATCH 16/74] fix semicolon meaning --- src/parser/parse_from_text/markdown_elements.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index 2b93e0f..cd82821 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -103,7 +103,7 @@ pub(crate) fn delimited_link(input: &str) -> IResult<&str, Element, CustomError< Err(nom::Err::Error(err)) => { return Err(nom::Err::Error(CustomError::Nom(err.input, err.code))); } - } + }; if !rest.is_empty() { return Err(nom::Err::Error(CustomError::UnexpectedContent)); } From e3ac2ca7cf6172ee972ef5788640bdacff520699 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 5 Mar 2024 19:56:11 +0330 Subject: [PATCH 17/74] Fixing tons of things... 
--- src/parser/link_url.rs | 27 ++-- src/parser/parse_from_text/base_parsers.rs | 2 +- src/parser/parse_from_text/find_range.rs | 2 +- src/parser/parse_from_text/link_element.rs | 130 +++++++++++++----- .../parse_from_text/markdown_elements.rs | 6 + 5 files changed, 113 insertions(+), 54 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index ff9a57b..0ae9b06 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -38,9 +38,9 @@ pub struct LinkDestination<'a> { #[derive(Debug, PartialEq, Eq, Serialize)] pub struct PunycodeWarning { - original_hostname: String, - ascii_hostname: String, - punycode_encoded_url: String, + pub original_hostname: String, + pub ascii_hostname: String, + pub punycode_encoded_url: String, } /// determines which generic schemes (without '://') get linkifyed @@ -320,22 +320,11 @@ fn host<'a>(input: &'a str) -> IResult<&'a str, (&'a str, bool), LinkParseError< fn punycode_encode(host: &str) -> String { host.split('.') .map(|sub| { - let mut has_non_ascii_char = false; - for char in sub.chars() { - if !is_alphanum_or_hyphen_minus(char) { - has_non_ascii_char = true; - break; - } - } - if has_non_ascii_char { - format!( - "xn--{}", - unic_idna_punycode::encode_str(sub) - .unwrap_or_else(|| "[punycode encode failed]".to_owned()) - ) - } else { - sub.to_owned() - } + format!( + "xn--{}", + unic_idna_punycode::encode_str(sub) + .unwrap_or_else(|| "[punycode encode failed]".to_owned()) + ) }) .collect::>() .join(".") diff --git a/src/parser/parse_from_text/base_parsers.rs b/src/parser/parse_from_text/base_parsers.rs index cc88ea0..9881d36 100644 --- a/src/parser/parse_from_text/base_parsers.rs +++ b/src/parser/parse_from_text/base_parsers.rs @@ -3,7 +3,7 @@ use std::fmt::Debug; ///! 
Base utility parsers, used by both text and markdown parsers use nom::{ bytes::complete::tag, - error::{Error, ErrorKind, ParseError}, + error::{ErrorKind, ParseError}, sequence::delimited, IResult, }; diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs index 77f8ba1..b8a74bd 100644 --- a/src/parser/parse_from_text/find_range.rs +++ b/src/parser/parse_from_text/find_range.rs @@ -20,7 +20,7 @@ enum FindRangeResult<'a> { /// - `code` the u32 to look for a range for. /// /// - `ranges` a refernce to a slice of `RangeInclusive` -fn find_range_for_char<'a>(code: u32, ranges: &[RangeInclusive]) -> FindRangeResult<'a> { +fn find_range_for_char<'a>(code: u32, ranges: &'a [RangeInclusive]) -> FindRangeResult<'a> { let index = ranges.binary_search_by_key(&code, |range| *range.start()); match index { Ok(_) => FindRangeResult::WasOnRangeStart, diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 4905ed1..cbac20a 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -1,22 +1,22 @@ -use super::base_parsers::*; -use super::find_range::is_in_one_of_ranges; -use super::Element; -use crate::nom::{Offset, Slice}; -use crate::parser::link_url::LinkDestination; +use std::ops::RangeInclusive; + use nom::{ branch::alt, - bytes::complete::{tag, take, take_while, take_while1, take_while_m_n}, + bytes::complete::{tag, take_while, take_while_m_n}, character::complete::{char, u8}, - combinator::{opt, peek, recognize, verify}, + combinator::{opt, recognize}, multi::{count, many0, many1, many_m_n}, - sequence::{delimited, preceded, tuple}, + sequence::{delimited, tuple}, AsChar, IResult, }; -use std::ops::RangeInclusive; + +use super::find_range::is_in_one_of_ranges; +use super::Element; +use crate::parser::link_url::{LinkDestination, PunycodeWarning}; // Link syntax here is according to RFC 3986 & 3987 --Farooq -fn is_alpha(c: char) -> bool{ 
+fn is_alpha(c: char) -> bool { c.is_alphabetic() } @@ -196,32 +196,42 @@ fn parse_host(input: &str) -> IResult<&str, (&str, bool)> { } fn take_while_ireg(input: &str) -> IResult<&str, &str> { - alt((recognize(many0(take_while_pct_encoded)), take_while(is_ireg_name_not_pct_encoded)))(input) + alt(( + recognize(many0(take_while_pct_encoded)), + take_while(is_ireg_name_not_pct_encoded), + ))(input) } fn is_userinfo_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) } -fn iauthority(input: &str) -> IResult<&str, (&str, &str, &str, bool)> { +fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool)> /* (iauthority, host, bool) */ { + let i = <&str>::clone(&input); let (input, userinfo) = opt(recognize(tuple((take_while_iuserinfo, char('@')))))(input)?; let (input, (host, is_ipv6_or_future)) = parse_host(input)?; - let (input, port) = preceded(char(':'), take_while(is_digit))(input)?; + let (input, port) = opt(recognize(tuple((char(':'), take_while(is_digit)))))(input)?; let userinfo = userinfo.unwrap_or(""); - Ok((input, (userinfo, host, port, is_ipv6_or_future))) + let port = port.unwrap_or(""); + let len = userinfo.len() + host.len() + port.len(); + Ok((input, (&i[0..len], host, is_ipv6_or_future))) } fn take_while_iuserinfo(input: &str) -> IResult<&str, &str> { - alt((recognize(many0(take_while_pct_encoded)), take_while(is_iuserinfo_not_pct_encoded)))(input) + alt(( + recognize(many0(take_while_pct_encoded)), + take_while(is_iuserinfo_not_pct_encoded), + ))(input) } fn is_iuserinfo_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) || c == ':' } -fn ihier_part(input: &str) -> IResult<&str, (&str, &str, &str, &str, bool)> { - let (input, (userinfo, host, port, is_ipv6_or_future)) = - preceded(tag("//"), iauthority)(input)?; +fn ihier_part(input: &str) -> IResult<&str, (&str, &str, bool)> { + let i = <&str>::clone(&input); + let (input, _double_slash) = tag("//")(input)?; + let (input, (authority, host, 
is_ipv6_or_future)) = iauthority(input)?; let (input, path) = opt(alt(( recognize(tuple(( char('/'), @@ -236,7 +246,9 @@ fn ihier_part(input: &str) -> IResult<&str, (&str, &str, &str, &str, bool)> { ))), // ipath_rootless )))(input)?; let path = path.unwrap_or(""); // it's ipath_empty - Ok((input, (userinfo, host, port, path, is_ipv6_or_future))) + let len = 2 + authority.len() + path.len(); + // 2 is for double_slash + Ok((input, (&i[0..len], host, is_ipv6_or_future))) } fn is_ipchar_not_pct_encoded(c: char) -> bool { @@ -244,11 +256,17 @@ fn is_ipchar_not_pct_encoded(c: char) -> bool { } fn take_while_ipchar(input: &str) -> IResult<&str, &str> { - recognize(many0(alt((take_while(is_ipchar_not_pct_encoded), take_while_pct_encoded))))(input) + recognize(many0(alt(( + take_while(is_ipchar_not_pct_encoded), + take_while_pct_encoded, + ))))(input) } fn take_while_ipchar1(input: &str) -> IResult<&str, &str> { - recognize(many1(alt((take_while(is_ipchar_not_pct_encoded), take_while_pct_encoded))))(input) + recognize(many1(alt(( + take_while(is_ipchar_not_pct_encoded), + take_while_pct_encoded, + ))))(input) } const IPRIVATE_RANGES: [RangeInclusive; 3] = @@ -263,11 +281,19 @@ fn is_iquery_not_pct_encoded(c: char) -> bool { } fn iquery(input: &str) -> IResult<&str, &str> { - recognize(many0(alt((take_while(is_iquery_not_pct_encoded), take_while_pct_encoded))))(input) + recognize(many0(alt(( + take_while(is_iquery_not_pct_encoded), + take_while_pct_encoded, + ))))(input) } fn take_while_ifragment(input: &str) -> IResult<&str, &str> { - recognize(many0(alt((take_while_ipchar, take_while_pct_encoded, tag("/"), tag("?")))))(input) + recognize(many0(alt(( + take_while_ipchar, + take_while_pct_encoded, + tag("/"), + tag("?"), + ))))(input) } fn scheme(input: &str) -> IResult<&str, &str> { @@ -285,21 +311,59 @@ fn take_while_pct_encoded(input: &str) -> IResult<&str, &str> { recognize(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit))))(input) } +fn punycode_encode(host: &str) 
-> String { + host.split('.') + .map(|sub| { + format!( + "xn--{}", + unic_idna_punycode::encode_str(sub) + .unwrap_or_else(|| "[punycode encode failed]".to_owned()) + ) + }) + .collect::>() + .join(".") +} + +fn is_puny(host: &str) -> bool { + for ch in host.chars() { + if !(is_alphanum_or_hyphen_minus(ch) || ch == '.') { + return true; + } + } + false +} + +fn get_puny_code_warning(link: &str, host: &str) -> Option { + if is_puny(host) { + let ascii_hostname = punycode_encode(host); + Some(PunycodeWarning { + original_hostname: host.to_owned(), + ascii_hostname: ascii_hostname.to_owned(), + punycode_encoded_url: link.replacen(host, &ascii_hostname, 1) + }) + } else { + None + } +} + pub fn link(input: &str) -> IResult<&str, Element> { let input_ = <&str>::clone(&input); - let (input_, scheme) = scheme(input)?; - let (input_, (userinfo, host, port, path, is_ipv6_or_future)) = ihier_part(input)?; - let (input_, Some(query)) = opt(preceded(char('?'), iquery))(input)?; - let (input_, Some(fragment)) = opt(preceded(char('#'), take_while_ifragment))(input)?; - let mut s = format!("{scheme}://{userinfo}@{host}:{port}{path}?{query}#{fragment}"); + let (input, scheme) = scheme(input)?; + let (input, (ihier, host, is_ipv6_or_future)) = ihier_part(input)?; + let (input, query) = opt(recognize(tuple((char('?'), iquery))))(input)?; + let (input_, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; + let query = query.unwrap_or(""); + let fragment = fragment.unwrap_or(""); + let len = scheme.len() + ihier.len() + query.len() + fragment.len(); + let link = &input_[0..len]; Ok(( - &input[0..s.len()], + input, Element::Link { destination: LinkDestination { - target: &input[(scheme.len() + 3)..(userinfo.len() + 2 + host.len() + port.len())], - hostname: Some(host), - punycode: None, - scheme: scheme, + target: link, + hostname: if host.len() == 0 { None } else { Some(host) }, + punycode: if is_ipv6_or_future { None } else { get_puny_code_warning(link, 
host) } , + scheme, }, }, )) diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index cd82821..393d681 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -102,6 +102,12 @@ pub(crate) fn delimited_link(input: &str) -> IResult<&str, Element, CustomError< Ok((rest, link)) => (rest, link), Err(nom::Err::Error(err)) => { return Err(nom::Err::Error(CustomError::Nom(err.input, err.code))); + }, + Err(nom::Err::Incomplete(err)) => { + return Err(nom::Err::Incomplete(err)); + }, + Err(nom::Err::Failure(err)) => { + return Err(nom::Err::Failure(CustomError::)); } }; if !rest.is_empty() { From f4dbe805b3640ffbd08f3663c5447b4c65327070 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 12 Mar 2024 16:53:36 +0330 Subject: [PATCH 18/74] fix some stuff --- src/parser/parse_from_text/base_parsers.rs | 6 ++++++ src/parser/parse_from_text/link_element.rs | 15 +++++++++------ src/parser/parse_from_text/markdown_elements.rs | 6 ++++-- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/parser/parse_from_text/base_parsers.rs b/src/parser/parse_from_text/base_parsers.rs index 9881d36..8b03b6a 100644 --- a/src/parser/parse_from_text/base_parsers.rs +++ b/src/parser/parse_from_text/base_parsers.rs @@ -97,3 +97,9 @@ impl From for Err> { } } */ +/* +impl From> for nom::Err> { + fn from(input: I, code: ErrorKind) -> nom::Err> { + nom::Err(CustomError::Nom(input, code) + } +}*/ diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index cbac20a..7dc624f 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -10,7 +10,10 @@ use nom::{ AsChar, IResult, }; -use super::find_range::is_in_one_of_ranges; +use super::{ + find_range::is_in_one_of_ranges, + base_parsers::CustomError, +}; use super::Element; use crate::parser::link_url::{LinkDestination, 
PunycodeWarning}; @@ -181,7 +184,7 @@ fn ip_literal(input: &str) -> IResult<&str, &str> { /// /// Parse host. Returns the rest, the host string and a boolean indicating /// if it is IPvFuture or IPv6. -fn parse_host(input: &str) -> IResult<&str, (&str, bool)> { +fn parse_host(input: &str) -> IResult<&str, (&str, bool), CustomError<&str>> { match ip_literal(input) { Ok((input, host)) => { // It got parsed, then it's an IP Literal meaning @@ -206,7 +209,7 @@ fn is_userinfo_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) } -fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool)> /* (iauthority, host, bool) */ { +fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> /* (iauthority, host, bool) */ { let i = <&str>::clone(&input); let (input, userinfo) = opt(recognize(tuple((take_while_iuserinfo, char('@')))))(input)?; let (input, (host, is_ipv6_or_future)) = parse_host(input)?; @@ -228,7 +231,7 @@ fn is_iuserinfo_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) || c == ':' } -fn ihier_part(input: &str) -> IResult<&str, (&str, &str, bool)> { +fn ihier_part(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> { let i = <&str>::clone(&input); let (input, _double_slash) = tag("//")(input)?; let (input, (authority, host, is_ipv6_or_future)) = iauthority(input)?; @@ -296,7 +299,7 @@ fn take_while_ifragment(input: &str) -> IResult<&str, &str> { ))))(input) } -fn scheme(input: &str) -> IResult<&str, &str> { +fn scheme(input: &str) -> IResult<&str, &str, CustomError<&str>> { take_while(is_scheme)(input) } @@ -346,7 +349,7 @@ fn get_puny_code_warning(link: &str, host: &str) -> Option { } } -pub fn link(input: &str) -> IResult<&str, Element> { +pub fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { let input_ = <&str>::clone(&input); let (input, scheme) = scheme(input)?; let (input, (ihier, host, is_ipv6_or_future)) = ihier_part(input)?; diff --git 
a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index 393d681..5dd2e61 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -98,7 +98,8 @@ pub(crate) fn delimited_link(input: &str) -> IResult<&str, Element, CustomError< if content.is_empty() { return Err(nom::Err::Error(CustomError::NoContent)); } - let (rest, link) = match link(content) { + /* + let (rest, link) = match link(content)?; { Ok((rest, link)) => (rest, link), Err(nom::Err::Error(err)) => { return Err(nom::Err::Error(CustomError::Nom(err.input, err.code))); @@ -109,7 +110,8 @@ pub(crate) fn delimited_link(input: &str) -> IResult<&str, Element, CustomError< Err(nom::Err::Failure(err)) => { return Err(nom::Err::Failure(CustomError::)); } - }; + };*/ + let (rest, link) = link(input)?; if !rest.is_empty() { return Err(nom::Err::Error(CustomError::UnexpectedContent)); } From 9e4f74253ce65c78affbff58a1f4798bd75f1709 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 12 Mar 2024 17:03:48 +0330 Subject: [PATCH 19/74] IT'S COMPILING, MAN \:D/ --- src/parser/parse_from_text/link_element.rs | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 7dc624f..9b18a37 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -85,7 +85,7 @@ fn is_ipv4(c: char) -> bool { is_digit(c) || c == '.' 
} -fn ipv4(input: &str) -> IResult<&str, &str> { +fn ipv4(input: &str) -> IResult<&str, &str, CustomError<&str>> { let (input, ipv4_) = recognize(tuple((u8, char('.'), u8, char('.'), u8, char('.'), u8)))(input)?; Ok((input, ipv4_)) @@ -95,11 +95,11 @@ fn is_ireg_name_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) } -fn h16(input: &str) -> IResult<&str, &str> { +fn h16(input: &str) -> IResult<&str, &str, CustomError<&str>> { take_while_m_n(1, 4, is_hex_digit)(input) } -fn ls32(input: &str) -> IResult<&str, &str> { +fn ls32(input: &str) -> IResult<&str, &str, CustomError<&str>> { let result = recognize(tuple((h16, char(':'), h16)))(input); if result.is_err() { ipv4(input) @@ -108,15 +108,15 @@ fn ls32(input: &str) -> IResult<&str, &str> { } } -fn h16_and_period(input: &str) -> IResult<&str, &str> { +fn h16_and_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple((h16, char(':'))))(input) } -fn double_period(input: &str) -> IResult<&str, &str> { +fn double_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { tag("::")(input) } -fn ipv6(input: &str) -> IResult<&str, &str> { +fn ipv6(input: &str) -> IResult<&str, &str, CustomError<&str>> { alt(( recognize(tuple((count(h16_and_period, 6), ls32))), recognize(tuple((double_period, many_m_n(5, 5, h16_and_period), ls32))), @@ -165,7 +165,7 @@ fn is_ipvfuture_last(c: char) -> bool { is_unreserved(c) || is_sub_delim(c) || c == ':' } -fn ipvfuture(input: &str) -> IResult<&str, &str> { +fn ipvfuture(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple(( char('v'), take_while_m_n(1, 1, is_hex_digit), @@ -174,7 +174,7 @@ fn ipvfuture(input: &str) -> IResult<&str, &str> { )))(input) } -fn ip_literal(input: &str) -> IResult<&str, &str> { +fn ip_literal(input: &str) -> IResult<&str, &str, CustomError<&str>> { delimited(char('['), alt((ipv6, ipvfuture)), char(']'))(input) } @@ -198,7 +198,7 @@ fn parse_host(input: &str) -> IResult<&str, (&str, bool), 
CustomError<&str>> { } } -fn take_while_ireg(input: &str) -> IResult<&str, &str> { +fn take_while_ireg(input: &str) -> IResult<&str, &str, CustomError<&str>> { alt(( recognize(many0(take_while_pct_encoded)), take_while(is_ireg_name_not_pct_encoded), @@ -220,7 +220,7 @@ fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str Ok((input, (&i[0..len], host, is_ipv6_or_future))) } -fn take_while_iuserinfo(input: &str) -> IResult<&str, &str> { +fn take_while_iuserinfo(input: &str) -> IResult<&str, &str, CustomError<&str>> { alt(( recognize(many0(take_while_pct_encoded)), take_while(is_iuserinfo_not_pct_encoded), @@ -258,14 +258,14 @@ fn is_ipchar_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) || matches!(c, ':' | '@') } -fn take_while_ipchar(input: &str) -> IResult<&str, &str> { +fn take_while_ipchar(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many0(alt(( take_while(is_ipchar_not_pct_encoded), take_while_pct_encoded, ))))(input) } -fn take_while_ipchar1(input: &str) -> IResult<&str, &str> { +fn take_while_ipchar1(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many1(alt(( take_while(is_ipchar_not_pct_encoded), take_while_pct_encoded, @@ -283,14 +283,14 @@ fn is_iquery_not_pct_encoded(c: char) -> bool { is_iprivate(c) || is_ipchar_not_pct_encoded(c) || matches!(c, '/' | '?') } -fn iquery(input: &str) -> IResult<&str, &str> { +fn iquery(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many0(alt(( take_while(is_iquery_not_pct_encoded), take_while_pct_encoded, ))))(input) } -fn take_while_ifragment(input: &str) -> IResult<&str, &str> { +fn take_while_ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many0(alt(( take_while_ipchar, take_while_pct_encoded, @@ -310,7 +310,7 @@ fn is_alphanum_or_hyphen_minus(char: char) -> bool { } } -fn take_while_pct_encoded(input: &str) -> IResult<&str, &str> { +fn take_while_pct_encoded(input: &str) -> 
IResult<&str, &str, CustomError<&str>> { recognize(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit))))(input) } From 6f16d665b602fbab346bb520134d75deee3734ac Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 12 Mar 2024 18:07:26 +0330 Subject: [PATCH 20/74] fix some problems with is_in_one_of_ranges and updating the code: --- src/parser/parse_from_text/find_range.rs | 5 +- .../hashtag_content_char_ranges.rs | 48 +++++++++---------- src/parser/parse_from_text/link_element.rs | 19 ++++++-- .../parse_from_text/markdown_elements.rs | 4 +- src/parser/parse_from_text/text_elements.rs | 4 +- 5 files changed, 45 insertions(+), 35 deletions(-) diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs index b8a74bd..3090ed5 100644 --- a/src/parser/parse_from_text/find_range.rs +++ b/src/parser/parse_from_text/find_range.rs @@ -39,11 +39,10 @@ fn find_range_for_char<'a>(code: u32, ranges: &'a [RangeInclusive]) -> Find /// /// # Arguments /// -/// - `c` A character +/// - `c` A number(u32) /// /// - `ranges` A sorted slice of ranges to see if `c` is in anyone of them -pub fn is_in_one_of_ranges(c: char, ranges: &[RangeInclusive]) -> bool { - let c = c as u32; +pub fn is_in_one_of_ranges(c: u32, ranges: &[RangeInclusive]) -> bool { match find_range_for_char(c, ranges) { FindRangeResult::WasOnRangeStart => true, FindRangeResult::Range(range) => range.contains(&c), diff --git a/src/parser/parse_from_text/hashtag_content_char_ranges.rs b/src/parser/parse_from_text/hashtag_content_char_ranges.rs index 1c934fe..642943d 100644 --- a/src/parser/parse_from_text/hashtag_content_char_ranges.rs +++ b/src/parser/parse_from_text/hashtag_content_char_ranges.rs @@ -876,41 +876,39 @@ pub(crate) fn hashtag_content_char(c: char) -> bool { } else if matches!(c, '+' | '-' | '_') { true } else { - is_in_one_of_ranges(c, &HASHTAG_CONTENT_CHAR_RANGES[..]) + is_in_one_of_ranges(c as u32, &HASHTAG_CONTENT_CHAR_RANGES[..]) } } #[cfg(test)] mod test 
{ use crate::parser::parse_from_text::hashtag_content_char_ranges::hashtag_content_char; - - use super::{find_range_for_char, FindRangeResult, RangeInclusive}; + use crate::parser::parse_from_text::find_range::is_in_one_of_ranges; + use std::ops::RangeInclusive; #[test] fn test_range_function() { - // these must return WasOnRangeStart - let codes: Vec = vec![0x30000, 0xe0100, 0x23, 0x30, 0x171f, 0x176e, 0x10fb0]; - for code in codes.iter() { - assert_eq!(find_range_for_char(*code), FindRangeResult::WasOnRangeStart); - } - - // these must be return associated ranges - let codes: Vec<(u32, RangeInclusive)> = vec![ - (0x11066 + 5, 0x11066..=0x11075), // in range - (0x11000 + 10, 0x11000..=0x11046), // in range - (0x11046 + 2, 0x11000..=0x11046), // out of range - (0x10, 0x23..=0x23), - (0x09, 0x23..=0x23), - (0x0, 0x23..=0x23), - (0x25, 0x23..=0x23), - (0x2a + 1, 0x2a..=0x2a), - (0xfffff, 0xe0100..=0xe01ef), - // ^ this is beyond ranges and must return the - // last range + let ranges: [RangeInclusive; 5] = [ + 0x0..=0x30, + 0x99..=0x99, + 0x1f..=0x2f, + 0xff..=0xff, + 0x1000f..=0x20000, ]; - - for (code, range) in codes.iter() { - assert_eq!(find_range_for_char(*code), FindRangeResult::Range(range)); + let codes: Vec<(u32, bool)> = vec![ + (0x30000, false), + (0x01, true), + (0x23, true), + (0x30, false), + (0x171f, false), + (0x176e, false), + (0x10fb0, true), + (0x0, true), + (0xf1, false) + ]; + for (code, result) in codes.iter() { + assert_eq!(is_in_one_of_ranges(*code, &ranges[..]), *result); + println!("{code}, {result}"); } } diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index 9b18a37..d8bff04 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -53,7 +53,7 @@ const UCSCHAR_RANGES: [RangeInclusive; 17] = [ ]; fn is_ucschar(c: char) -> bool { - is_in_one_of_ranges(c, &UCSCHAR_RANGES[..]) + is_in_one_of_ranges(c as u32, &UCSCHAR_RANGES[..]) } fn 
is_unreserved(c: char) -> bool { @@ -276,7 +276,7 @@ const IPRIVATE_RANGES: [RangeInclusive; 3] = [0xe000..=0xf8ff, 0xf0000..=0xffffd, 0x100000..=0x10fffd]; fn is_iprivate(c: char) -> bool { - is_in_one_of_ranges(c, &IPRIVATE_RANGES[..]) + is_in_one_of_ranges(c as u32, &IPRIVATE_RANGES[..]) } fn is_iquery_not_pct_encoded(c: char) -> bool { @@ -349,7 +349,7 @@ fn get_puny_code_warning(link: &str, host: &str) -> Option { } } -pub fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { +fn parse_iri(input: &str) -> IResult<&str, Element, CustomError<&str>> { let input_ = <&str>::clone(&input); let (input, scheme) = scheme(input)?; let (input, (ihier, host, is_ipv6_or_future)) = ihier_part(input)?; @@ -371,3 +371,16 @@ pub fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { }, )) } + +fn parse_irelative_ref(input: &str) -> IResult<&str, Element, CustomError<&str>> { + todo!() +} + +pub fn parse_link(input: &str) -> IResult<&str, Element, CustomError<&str>> { + /* + match parse_iri(input) { + Ok((input, iri)) => Ok((input, iri)), + Err(..) => parse_irelative_ref(input), + }*/ + parse_iri(input) +} diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index 5dd2e61..7123c21 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -7,7 +7,7 @@ use super::text_elements::parse_text_element; use super::Element; use super::{base_parsers::*, parse_all}; use crate::parser::link_url::LinkDestination; -use crate::parser::parse_from_text::link_element::link; +use crate::parser::parse_from_text::link_element::parse_link; ///! 
nom parsers for markdown elements use nom::{ bytes::complete::{is_not, tag, take, take_while}, @@ -111,7 +111,7 @@ pub(crate) fn delimited_link(input: &str) -> IResult<&str, Element, CustomError< return Err(nom::Err::Failure(CustomError::)); } };*/ - let (rest, link) = link(input)?; + let (rest, link) = parse_link(input)?; if !rest.is_empty() { return Err(nom::Err::Error(CustomError::UnexpectedContent)); } diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index a04d040..27e5b97 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -4,7 +4,7 @@ use crate::parser::link_url::LinkDestination; use super::base_parsers::CustomError; use super::base_parsers::*; use super::hashtag_content_char_ranges::hashtag_content_char; -use super::link_element::link; +use super::link_element::parse_link; use super::Element; use crate::nom::{Offset, Slice}; use nom::bytes::complete::take_while; @@ -280,7 +280,7 @@ pub(crate) fn parse_text_element( Ok((i, elm)) } else if let Ok((i, elm)) = email_address(input) { Ok((i, elm)) - } else if let Ok((i, elm)) = link(input) { + } else if let Ok((i, elm)) = parse_link(input) { Ok((i, elm)) } else if let Ok((i, _)) = linebreak(input) { Ok((i, Element::Linebreak)) From aa6d26d7c45a6f4ab554103809d5bbf5d6bcd578 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 12 Mar 2024 19:25:42 +0330 Subject: [PATCH 21/74] started refactoring --- src/parser/link_url.rs | 595 +++++++++++------- .../hashtag_content_char_ranges.rs | 6 +- src/parser/parse_from_text/link_element.rs | 385 ------------ src/parser/parse_from_text/mod.rs | 2 +- tests/text_to_ast/links.rs | 26 + 5 files changed, 402 insertions(+), 612 deletions(-) create mode 100644 tests/text_to_ast/links.rs diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 0ae9b06..8706abb 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -1,17 +1,23 @@ +use 
std::ops::RangeInclusive; + use nom::{ branch::alt, - bytes::complete::{tag, take, take_till1, take_while, take_while1}, - character::complete::char, - character::complete::digit1, - combinator::{consumed, opt, recognize}, + bytes::complete::{tag, take_while, take_while_m_n}, + character::complete::{char, u8}, + combinator::{opt, recognize}, error::{ErrorKind, ParseError}, - multi::many0, - sequence::delimited, - sequence::tuple, + multi::{count, many0, many1, many_m_n}, + sequence::{tuple, delimited}, AsChar, IResult, }; -use super::parse_from_text::base_parsers::{is_not_white_space, CustomError}; +use super::Element; +use super::parse_from_text::{ + base_parsers::{is_not_white_space, CustomError}, + find_range::is_in_one_of_ranges, +}; + +// Link syntax here is according to RFC 3986 & 3987 --Farooq ///! Parsing / Validation of URLs /// @@ -68,84 +74,31 @@ impl LinkDestination<'_> { pub(crate) fn parse_standalone_with_whitelist( input: &str, ) -> IResult<&str, LinkDestination, CustomError<&str>> { - if let Ok((rest, (link, info))) = parse_url(input) { - let (hostname, punycode, scheme) = match info { - UrlInfo::CommonInternetSchemeURL { - has_puny_code_in_host_name, - hostname, - ascii_hostname, - scheme, - } => { - if has_puny_code_in_host_name { - ( - Some(hostname), - Some(PunycodeWarning { - original_hostname: hostname.to_owned(), - punycode_encoded_url: link.replacen(hostname, &ascii_hostname, 1), - ascii_hostname, - }), - scheme, - ) - } else { - (Some(hostname), None, scheme) - } - } - UrlInfo::GenericUrl { scheme } => { - if !is_allowed_generic_scheme(scheme) { - return Err(nom::Err::Error(CustomError::InvalidLink)); - } - (None, None, scheme) + if let Ok((rest, link)) = parse_link(input) { + if link.destination.hostname == "" { + // if it's a generic url like tel:+989164364485 + if !is_allowed_generic_scheme(scheme) { + Err(nom::Err::Error(CustomError::InvalidLink)) + } else { + Ok((rest, link.destination)) } - }; - - Ok(( - rest, - LinkDestination { - 
target: link, - hostname, - punycode, - scheme, - }, - )) + } else { + Ok(( + Some(link.destination.hostname), + Some(link.destination.punycode), + link.destination.scheme + )) + } } else { Err(nom::Err::Error(CustomError::InvalidLink)) } } pub fn parse(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { - if let Ok((rest, (link, info))) = parse_url(input) { - let (hostname, punycode, scheme) = match info { - UrlInfo::CommonInternetSchemeURL { - has_puny_code_in_host_name, - hostname, - ascii_hostname, - scheme, - } => { - if has_puny_code_in_host_name { - ( - Some(hostname), - Some(PunycodeWarning { - original_hostname: hostname.to_owned(), - punycode_encoded_url: link.replacen(hostname, &ascii_hostname, 1), - ascii_hostname, - }), - scheme, - ) - } else { - (Some(hostname), None, scheme) - } - } - UrlInfo::GenericUrl { scheme, .. } => (None, None, scheme), - }; - + if let Ok((rest, link_element))) = parse_link(input) { Ok(( rest, - LinkDestination { - target: link, - hostname, - punycode, - scheme, - }, + link_element.destination )) } else { Err(nom::Err::Error(CustomError::InvalidLink)) @@ -183,209 +136,405 @@ impl ParseError for LinkParseError { } } -fn is_reserved(char: char) -> bool { - matches!(char, ';' | '/' | '?' 
| ':' | '@' | '&' | '=') +fn is_alphanum_or_hyphen_minus(char: char) -> bool { + match char { + '-' => true, + _ => char.is_alphanum(), + } +} + + +fn is_alpha(c: char) -> bool { + c.is_alphabetic() } fn is_hex_digit(c: char) -> bool { c.is_ascii_hexdigit() } -fn escaped_char(input: &str) -> IResult<&str, &str, LinkParseError<&str>> { - let (input, content) = take(3usize)(input)?; - let mut content_chars = content.chars(); +fn is_digit(c: char) -> bool { + c.is_digit(10) +} - if content_chars.next() == Some('%') - && content_chars.next().map(is_hex_digit) == Some(true) - && content_chars.next().map(is_hex_digit) == Some(true) - { - Ok((input, content)) - } else { - Err(nom::Err::Error(LinkParseError::ThisIsNotPercentEncoding)) - } +// These ranges have been extracted from RFC3987, Page 8. +const UCSCHAR_RANGES: [RangeInclusive; 17] = [ + 0xa0..=0xd7ff, + 0xF900..=0xFDCF, + 0xFDF0..=0xFFEF, + 0x10000..=0x1FFFD, + 0x20000..=0x2FFFD, + 0x30000..=0x3FFFD, + 0x40000..=0x4FFFD, + 0x50000..=0x5FFFD, + 0x60000..=0x6FFFD, + 0x70000..=0x7FFFD, + 0x80000..=0x8FFFD, + 0x90000..=0x9FFFD, + 0xA0000..=0xAFFFD, + 0xB0000..=0xBFFFD, + 0xC0000..=0xCFFFD, + 0xD0000..=0xDFFFD, + 0xE1000..=0xEFFFD, +]; + +fn is_ucschar(c: char) -> bool { + is_in_one_of_ranges(c as u32, &UCSCHAR_RANGES[..]) +} + +fn is_unreserved(c: char) -> bool { + is_alpha(c) || is_digit(c) || is_other_unreserved(c) +} + +fn is_iunreserved(c: char) -> bool { + is_ucschar(c) || is_unreserved(c) } -fn is_safe(char: char) -> bool { - matches!(char, '$' | '-' | '_' | '.' | '+') +fn is_other_unreserved(c: char) -> bool { + matches!(c, '_' | '.' | '_' | '~') } -fn is_extra(char: char) -> bool { +fn is_sub_delim(c: char) -> bool { matches!( - char, - '!' | '*' | '\'' | '(' | ')' | ',' | '{' | '}' | '[' | ']' | '<' | '>' + c, + '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' ) } -fn is_unreserved(char: char) -> bool { - char.is_alphanum() || is_safe(char) || is_extra(char) +// Here again, order is important. 
As URLs/IRIs have letters in them +// most of the time and less digits or other characters. --Farooq +fn is_scheme(c: char) -> bool { + is_alpha(c) || is_digit(c) || is_scheme(c) } -fn x_char_sequence(input: &str) -> IResult<&str, &str, LinkParseError<&str>> { - //xchar = unreserved | reserved | escape - recognize(many0(alt(( - take_while1(is_unreserved), - take_while1(is_reserved), - escaped_char, - tag("#"), - ))))(input) +fn is_ipv4(c: char) -> bool { + is_digit(c) || c == '.' } -fn scheme_char(char: char) -> bool { - //; the scheme is in lower case; interpreters should use case-ignore - //scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] - match char { - '+' | '-' | '.' => true, - _ => char.is_alphanum(), +fn ipv4(input: &str) -> IResult<&str, &str, CustomError<&str>> { + let (input, ipv4_) = + recognize(tuple((u8, char('.'), u8, char('.'), u8, char('.'), u8)))(input)?; + Ok((input, ipv4_)) +} + +fn is_ireg_name_not_pct_encoded(c: char) -> bool { + is_iunreserved(c) || is_sub_delim(c) +} + +fn h16(input: &str) -> IResult<&str, &str, CustomError<&str>> { + take_while_m_n(1, 4, is_hex_digit)(input) +} + +fn ls32(input: &str) -> IResult<&str, &str, CustomError<&str>> { + let result = recognize(tuple((h16, char(':'), h16)))(input); + if result.is_err() { + ipv4(input) + } else { + result } } -fn is_user_or_password_char(char: char) -> bool { - match char { - ';' | '?' 
| '&' | '=' => true, - _ => is_unreserved(char), +fn h16_and_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { + recognize(tuple((h16, char(':'))))(input) +} + +fn double_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { + tag("::")(input) +} + +fn ipv6(input: &str) -> IResult<&str, &str, CustomError<&str>> { + alt(( + recognize(tuple((count(h16_and_period, 6), ls32))), + recognize(tuple((double_period, many_m_n(5, 5, h16_and_period), ls32))), + recognize(tuple(( + opt(h16), + double_period, + many_m_n(4, 4, h16_and_period), + ls32, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 1, h16_and_period),))), + double_period, + count(h16_and_period, 3), + ls32, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 2, h16_and_period), h16))), + double_period, + count(h16_and_period, 2), + ls32, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 3, h16_and_period), h16))), + double_period, + count(h16_and_period, 1), + ls32, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 4, h16_and_period), h16))), + double_period, + ls32, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 5, h16_and_period), h16))), + double_period, + h16, + ))), + recognize(tuple(( + opt(tuple((many_m_n(0, 6, h16_and_period), h16))), + double_period, + ))), + ))(input) +} + +fn is_ipvfuture_last(c: char) -> bool { + is_unreserved(c) || is_sub_delim(c) || c == ':' +} + +fn ipvfuture(input: &str) -> IResult<&str, &str, CustomError<&str>> { + recognize(tuple(( + char('v'), + take_while_m_n(1, 1, is_hex_digit), + char('.'), + take_while_m_n(1, 1, is_ipvfuture_last), + )))(input) +} + +fn ip_literal(input: &str) -> IResult<&str, &str, CustomError<&str>> { + delimited(char('['), alt((ipv6, ipvfuture)), char(']'))(input) +} + +/// Parse host +/// +/// # Description +/// +/// Parse host. Returns the rest, the host string and a boolean indicating +/// if it is IPvFuture or IPv6. 
+fn parse_host(input: &str) -> IResult<&str, (&str, bool), CustomError<&str>> { + match ip_literal(input) { + Ok((input, host)) => { + // It got parsed, then it's an IP Literal meaning + // it's either IPv6 or IPvFuture + Ok((input, (host, true))) + } + Err(..) => { + let (input, host) = alt((ipv4, take_while_ireg))(input)?; + Ok((input, (host, false))) + } } } -fn user_or_password(input: &str) -> IResult<&str, &str, LinkParseError<&str>> { +fn take_while_ireg(input: &str) -> IResult<&str, &str, CustomError<&str>> { + alt(( + recognize(many0(take_while_pct_encoded)), + take_while(is_ireg_name_not_pct_encoded), + ))(input) +} + +fn is_userinfo_not_pct_encoded(c: char) -> bool { + is_iunreserved(c) || is_sub_delim(c) +} + +fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> /* (iauthority, host, bool) */ +{ + let i = <&str>::clone(&input); + let (input, userinfo) = opt(recognize(tuple((take_while_iuserinfo, char('@')))))(input)?; + let (input, (host, is_ipv6_or_future)) = parse_host(input)?; + let (input, port) = opt(recognize(tuple((char(':'), take_while(is_digit)))))(input)?; + let userinfo = userinfo.unwrap_or(""); + let port = port.unwrap_or(""); + let len = userinfo.len() + host.len() + port.len(); + Ok((input, (&i[0..len], host, is_ipv6_or_future))) +} + +fn take_while_iuserinfo(input: &str) -> IResult<&str, &str, CustomError<&str>> { + alt(( + recognize(many0(take_while_pct_encoded)), + take_while(is_iuserinfo_not_pct_encoded), + ))(input) +} + +fn is_iuserinfo_not_pct_encoded(c: char) -> bool { + is_iunreserved(c) || is_sub_delim(c) || c == ':' +} + +fn ihier_part(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> { + let i = <&str>::clone(&input); + let (input, _double_slash) = tag("//")(input)?; + let (input, (authority, host, is_ipv6_or_future)) = iauthority(input)?; + let (input, path) = opt(alt(( + recognize(tuple(( + char('/'), + opt(tuple(( + take_while_ipchar1, + many0(tuple((char('/'), 
take_while_ipchar))), + ))), + ))), // ipath-absolute + recognize(tuple(( + take_while_ipchar, + many0(tuple((char('/'), take_while_ipchar))), + ))), // ipath_rootless + )))(input)?; + let path = path.unwrap_or(""); // it's ipath_empty + let len = 2 + authority.len() + path.len(); + // 2 is for double_slash + Ok((input, (&i[0..len], host, is_ipv6_or_future))) +} + +fn is_ipchar_not_pct_encoded(c: char) -> bool { + is_iunreserved(c) || is_sub_delim(c) || matches!(c, ':' | '@') +} + +fn take_while_ipchar(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many0(alt(( - take_while(is_user_or_password_char), - escaped_char, + take_while(is_ipchar_not_pct_encoded), + take_while_pct_encoded, ))))(input) } -fn login(input: &str) -> IResult<&str, (), LinkParseError<&str>> { - // login = user [ ":" password ] "@" - let (input, _) = user_or_password(input)?; - let (input, _) = opt(tuple((char(':'), user_or_password)))(input)?; - let (input, _) = char('@')(input)?; - Ok((input, ())) +fn take_while_ipchar1(input: &str) -> IResult<&str, &str, CustomError<&str>> { + recognize(many1(alt(( + take_while(is_ipchar_not_pct_encoded), + take_while_pct_encoded, + ))))(input) } -fn is_ipv6_char(char: char) -> bool { - match char { - ':' => true, - _ => is_hex_digit(char), - } +const IPRIVATE_RANGES: [RangeInclusive; 3] = + [0xe000..=0xf8ff, 0xf0000..=0xffffd, 0x100000..=0x10fffd]; + +fn is_iprivate(c: char) -> bool { + is_in_one_of_ranges(c as u32, &IPRIVATE_RANGES[..]) } -fn is_alphanum_or_hyphen_minus(char: char) -> bool { - match char { - '-' => true, - _ => char.is_alphanum(), - } +fn is_iquery_not_pct_encoded(c: char) -> bool { + is_iprivate(c) || is_ipchar_not_pct_encoded(c) || matches!(c, '/' | '?') } -fn is_forbidden_in_idnalabel(char: char) -> bool { - is_reserved(char) || is_extra(char) || char == '>' + +fn iquery(input: &str) -> IResult<&str, &str, CustomError<&str>> { + recognize(many0(alt(( + take_while(is_iquery_not_pct_encoded), + take_while_pct_encoded, + 
))))(input) } -/// creates possibility for punycodedecoded/unicode/internationalized domains -/// takes everything until reserved, extra or '>' -fn idnalabel(input: &str) -> IResult<&str, &str, LinkParseError<&str>> { - let (input, label) = take_till1(is_forbidden_in_idnalabel)(input)?; - Ok((input, label)) +fn take_while_ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { + recognize(many0(alt(( + take_while_ipchar, + take_while_pct_encoded, + tag("/"), + tag("?"), + ))))(input) } -fn host<'a>(input: &'a str) -> IResult<&'a str, (&'a str, bool), LinkParseError<&'a str>> { - if let Ok((input, host)) = recognize::<_, _, LinkParseError<&'a str>, _>(delimited( - char('['), - take_while1(is_ipv6_char), - char(']'), - ))(input) - { - // ipv6 hostnumber - // sure the parsing here could be more specific and correct -> TODO - Ok((input, (host, true))) - } else if let Ok((input, host)) = recognize::<_, _, LinkParseError<&'a str>, _>(tuple(( - digit1, - char('.'), - digit1, - char('.'), - digit1, - char('.'), - digit1, - )))(input) - { - // ipv4 hostnumber - // sure the parsing here could be more specific and correct -> TODO - Ok((input, (host, false))) - } else { - // idna hostname (valid chars until ':' or '/') - // sure the parsing here could be more specific and correct -> TODO - let (input, host) = - recognize(tuple((many0(tuple((idnalabel, char('.')))), idnalabel)))(input)?; - Ok((input, (host, false))) - } +fn scheme(input: &str) -> IResult<&str, &str, CustomError<&str>> { + take_while(is_scheme)(input) +} + +fn take_while_pct_encoded(input: &str) -> IResult<&str, &str, CustomError<&str>> { + recognize(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit))))(input) } fn punycode_encode(host: &str) -> String { host.split('.') .map(|sub| { - format!( - "xn--{}", - unic_idna_punycode::encode_str(sub) - .unwrap_or_else(|| "[punycode encode failed]".to_owned()) - ) + let has_non_ascii_char: bool = sub + .chars() + .map(|ch| is_alphanum_or_hyphen_minus(ch)) + 
.reduce(|acc, e| e && acc) + .unwrap_or(false); + if has_non_ascii_char { + format!( + "xn--{}", + unic_idna_punycode::encode_str(sub) + .unwrap_or_else(|| "[punycode encode failed]".to_owned()) + ) + } else { + sub.to_owned() + } }) .collect::>() .join(".") } +fn is_puny(host: &str) -> bool { + for ch in host.chars() { + if !(is_alphanum_or_hyphen_minus(ch) || ch == '.') { + return true; + } + } + false +} -fn url_intern<'a>(input: &'a str) -> IResult<&'a str, UrlInfo<'a>, LinkParseError<&'a str>> { - let (input, scheme) = take_while1(scheme_char)(input)?; - let (input, _) = tag(":")(input)?; - - if let Ok((input, _)) = tag::<&'a str, &'a str, LinkParseError<&'a str>>("//")(input) { - // ip-schemepart - // parse login - let (input, _) = opt(login)(input)?; - // parse host - let (input, (host, is_ipv6)) = host(input)?; - // parse port - let (input, _) = opt(tuple((char(':'), digit1)))(input)?; - // parse urlpath - let (input, _) = opt(tuple(( - alt((char('/'), char('?'), char('#'))), - x_char_sequence, - )))(input)?; - - let is_puny = if is_ipv6 { - false - } else { - let mut is_puny = false; - for char in host.chars() { - if !(is_alphanum_or_hyphen_minus(char) || char == '.') { - is_puny = true; - break; - } - } - is_puny - }; +fn get_puny_code_warning(link: &str, host: &str) -> Option { + if is_puny(host) { + let ascii_hostname = punycode_encode(host); + Some(PunycodeWarning { + original_hostname: host.to_owned(), + ascii_hostname: ascii_hostname.to_owned(), + punycode_encoded_url: link.replacen(host, &ascii_hostname, 1), + }) + } else { + None + } +} - Ok(( - input, - UrlInfo::CommonInternetSchemeURL { - scheme, - hostname: host, - has_puny_code_in_host_name: is_puny, - ascii_hostname: if is_puny { - punycode_encode(host) +// IRI links per RFC3987 and RFC3986 +fn parse_iri(input: &str) -> IResult<&str, Element, CustomError<&str>> { + let input_ = <&str>::clone(&input); + let (input, scheme) = scheme(input)?; + let (input, (ihier, host, is_ipv6_or_future)) = 
ihier_part(input)?; + let (input, query) = opt(recognize(tuple((char('?'), iquery))))(input)?; + let (input_, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; + let query = query.unwrap_or(""); + let fragment = fragment.unwrap_or(""); + let len = scheme.len() + ihier.len() + query.len() + fragment.len(); + let link = &input_[0..len]; + Ok(( + input, + Element::Link { + destination: LinkDestination { + target: link, + hostname: if host.len() == 0 { None } else { Some(host) }, + punycode: if is_ipv6_or_future { + None } else { - host.to_string() + get_puny_code_warning(link, host) }, + scheme, }, - )) - } else { - // schemepart - let (input, _) = take_while(is_not_white_space)(input)?; + }, + )) +} - Ok((input, UrlInfo::GenericUrl { scheme })) - } + +// For future +fn parse_irelative_ref(input: &str) -> IResult<&str, Element, CustomError<&str>> { + todo!() } -fn parse_url(input: &str) -> IResult<&str, (&str, UrlInfo), LinkParseError<&str>> { - consumed(url_intern)(input) + +// White listed links in this format: scheme:some_char like tel:+989164364485 +fn parse_generic(input: &str) -> IResult<&str, Element, CustomError<&str>> { + let (input, scheme) = scheme(input)?; + let (input, target) = take_while(is_not_white_space)(input)?; + + Ok((input, Element::Link { + destination: LinkDestination { + scheme, + target, + hostname: None, + punycode: None, + } + })) } + +pub fn parse_link(input: &str) -> IResult<&str, Element, CustomError<&str>> { + /* + match parse_iri(input) { + Ok((input, iri)) => Ok((input, iri)), + Err(..) 
=> parse_irelative_ref(input), + }*/ + alt((parse_iri, parse_generic))(input) +} // TODO testcases // ipv6 https://[::1]/ diff --git a/src/parser/parse_from_text/hashtag_content_char_ranges.rs b/src/parser/parse_from_text/hashtag_content_char_ranges.rs index 642943d..a113252 100644 --- a/src/parser/parse_from_text/hashtag_content_char_ranges.rs +++ b/src/parser/parse_from_text/hashtag_content_char_ranges.rs @@ -882,13 +882,13 @@ pub(crate) fn hashtag_content_char(c: char) -> bool { #[cfg(test)] mod test { - use crate::parser::parse_from_text::hashtag_content_char_ranges::hashtag_content_char; use crate::parser::parse_from_text::find_range::is_in_one_of_ranges; + use crate::parser::parse_from_text::hashtag_content_char_ranges::hashtag_content_char; use std::ops::RangeInclusive; #[test] fn test_range_function() { - let ranges: [RangeInclusive; 5] = [ + let ranges: [RangeInclusive; 5] = [ 0x0..=0x30, 0x99..=0x99, 0x1f..=0x2f, @@ -904,7 +904,7 @@ mod test { (0x176e, false), (0x10fb0, true), (0x0, true), - (0xf1, false) + (0xf1, false), ]; for (code, result) in codes.iter() { assert_eq!(is_in_one_of_ranges(*code, &ranges[..]), *result); diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs index d8bff04..8b13789 100644 --- a/src/parser/parse_from_text/link_element.rs +++ b/src/parser/parse_from_text/link_element.rs @@ -1,386 +1 @@ -use std::ops::RangeInclusive; -use nom::{ - branch::alt, - bytes::complete::{tag, take_while, take_while_m_n}, - character::complete::{char, u8}, - combinator::{opt, recognize}, - multi::{count, many0, many1, many_m_n}, - sequence::{delimited, tuple}, - AsChar, IResult, -}; - -use super::{ - find_range::is_in_one_of_ranges, - base_parsers::CustomError, -}; -use super::Element; -use crate::parser::link_url::{LinkDestination, PunycodeWarning}; - -// Link syntax here is according to RFC 3986 & 3987 --Farooq - -fn is_alpha(c: char) -> bool { - c.is_alphabetic() -} - -fn is_hex_digit(c: char) -> bool 
{ - c.is_ascii_hexdigit() -} - -fn is_digit(c: char) -> bool { - c.is_digit(10) -} - -// These ranges have been extracted from RFC3987, Page 8. -const UCSCHAR_RANGES: [RangeInclusive; 17] = [ - 0xa0..=0xd7ff, - 0xF900..=0xFDCF, - 0xFDF0..=0xFFEF, - 0x10000..=0x1FFFD, - 0x20000..=0x2FFFD, - 0x30000..=0x3FFFD, - 0x40000..=0x4FFFD, - 0x50000..=0x5FFFD, - 0x60000..=0x6FFFD, - 0x70000..=0x7FFFD, - 0x80000..=0x8FFFD, - 0x90000..=0x9FFFD, - 0xA0000..=0xAFFFD, - 0xB0000..=0xBFFFD, - 0xC0000..=0xCFFFD, - 0xD0000..=0xDFFFD, - 0xE1000..=0xEFFFD, -]; - -fn is_ucschar(c: char) -> bool { - is_in_one_of_ranges(c as u32, &UCSCHAR_RANGES[..]) -} - -fn is_unreserved(c: char) -> bool { - is_alpha(c) || is_digit(c) || is_other_unreserved(c) -} - -fn is_iunreserved(c: char) -> bool { - is_ucschar(c) || is_unreserved(c) -} - -fn is_other_unreserved(c: char) -> bool { - matches!(c, '_' | '.' | '_' | '~') -} - -fn is_sub_delim(c: char) -> bool { - matches!( - c, - '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' - ) -} - -// Here again, order is important. As URLs/IRIs have letters in them -// most of the time and less digits or other characters. --Farooq -fn is_scheme(c: char) -> bool { - is_alpha(c) || is_digit(c) || is_scheme(c) -} - -fn is_ipv4(c: char) -> bool { - is_digit(c) || c == '.' 
-} - -fn ipv4(input: &str) -> IResult<&str, &str, CustomError<&str>> { - let (input, ipv4_) = - recognize(tuple((u8, char('.'), u8, char('.'), u8, char('.'), u8)))(input)?; - Ok((input, ipv4_)) -} - -fn is_ireg_name_not_pct_encoded(c: char) -> bool { - is_iunreserved(c) || is_sub_delim(c) -} - -fn h16(input: &str) -> IResult<&str, &str, CustomError<&str>> { - take_while_m_n(1, 4, is_hex_digit)(input) -} - -fn ls32(input: &str) -> IResult<&str, &str, CustomError<&str>> { - let result = recognize(tuple((h16, char(':'), h16)))(input); - if result.is_err() { - ipv4(input) - } else { - result - } -} - -fn h16_and_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(tuple((h16, char(':'))))(input) -} - -fn double_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { - tag("::")(input) -} - -fn ipv6(input: &str) -> IResult<&str, &str, CustomError<&str>> { - alt(( - recognize(tuple((count(h16_and_period, 6), ls32))), - recognize(tuple((double_period, many_m_n(5, 5, h16_and_period), ls32))), - recognize(tuple(( - opt(h16), - double_period, - many_m_n(4, 4, h16_and_period), - ls32, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 1, h16_and_period),))), - double_period, - count(h16_and_period, 3), - ls32, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 2, h16_and_period), h16))), - double_period, - count(h16_and_period, 2), - ls32, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 3, h16_and_period), h16))), - double_period, - count(h16_and_period, 1), - ls32, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 4, h16_and_period), h16))), - double_period, - ls32, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 5, h16_and_period), h16))), - double_period, - h16, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 6, h16_and_period), h16))), - double_period, - ))), - ))(input) -} - -fn is_ipvfuture_last(c: char) -> bool { - is_unreserved(c) || is_sub_delim(c) || c == ':' -} - -fn ipvfuture(input: &str) -> IResult<&str, &str, 
CustomError<&str>> { - recognize(tuple(( - char('v'), - take_while_m_n(1, 1, is_hex_digit), - char('.'), - take_while_m_n(1, 1, is_ipvfuture_last), - )))(input) -} - -fn ip_literal(input: &str) -> IResult<&str, &str, CustomError<&str>> { - delimited(char('['), alt((ipv6, ipvfuture)), char(']'))(input) -} - -/// Parse host -/// -/// # Description -/// -/// Parse host. Returns the rest, the host string and a boolean indicating -/// if it is IPvFuture or IPv6. -fn parse_host(input: &str) -> IResult<&str, (&str, bool), CustomError<&str>> { - match ip_literal(input) { - Ok((input, host)) => { - // It got parsed, then it's an IP Literal meaning - // it's either IPv6 or IPvFuture - Ok((input, (host, true))) - } - Err(..) => { - let (input, host) = alt((ipv4, take_while_ireg))(input)?; - Ok((input, (host, false))) - } - } -} - -fn take_while_ireg(input: &str) -> IResult<&str, &str, CustomError<&str>> { - alt(( - recognize(many0(take_while_pct_encoded)), - take_while(is_ireg_name_not_pct_encoded), - ))(input) -} - -fn is_userinfo_not_pct_encoded(c: char) -> bool { - is_iunreserved(c) || is_sub_delim(c) -} - -fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> /* (iauthority, host, bool) */ { - let i = <&str>::clone(&input); - let (input, userinfo) = opt(recognize(tuple((take_while_iuserinfo, char('@')))))(input)?; - let (input, (host, is_ipv6_or_future)) = parse_host(input)?; - let (input, port) = opt(recognize(tuple((char(':'), take_while(is_digit)))))(input)?; - let userinfo = userinfo.unwrap_or(""); - let port = port.unwrap_or(""); - let len = userinfo.len() + host.len() + port.len(); - Ok((input, (&i[0..len], host, is_ipv6_or_future))) -} - -fn take_while_iuserinfo(input: &str) -> IResult<&str, &str, CustomError<&str>> { - alt(( - recognize(many0(take_while_pct_encoded)), - take_while(is_iuserinfo_not_pct_encoded), - ))(input) -} - -fn is_iuserinfo_not_pct_encoded(c: char) -> bool { - is_iunreserved(c) || is_sub_delim(c) || c == ':' -} - 
-fn ihier_part(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> { - let i = <&str>::clone(&input); - let (input, _double_slash) = tag("//")(input)?; - let (input, (authority, host, is_ipv6_or_future)) = iauthority(input)?; - let (input, path) = opt(alt(( - recognize(tuple(( - char('/'), - opt(tuple(( - take_while_ipchar1, - many0(tuple((char('/'), take_while_ipchar))), - ))), - ))), // ipath-absolute - recognize(tuple(( - take_while_ipchar, - many0(tuple((char('/'), take_while_ipchar))), - ))), // ipath_rootless - )))(input)?; - let path = path.unwrap_or(""); // it's ipath_empty - let len = 2 + authority.len() + path.len(); - // 2 is for double_slash - Ok((input, (&i[0..len], host, is_ipv6_or_future))) -} - -fn is_ipchar_not_pct_encoded(c: char) -> bool { - is_iunreserved(c) || is_sub_delim(c) || matches!(c, ':' | '@') -} - -fn take_while_ipchar(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(many0(alt(( - take_while(is_ipchar_not_pct_encoded), - take_while_pct_encoded, - ))))(input) -} - -fn take_while_ipchar1(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(many1(alt(( - take_while(is_ipchar_not_pct_encoded), - take_while_pct_encoded, - ))))(input) -} - -const IPRIVATE_RANGES: [RangeInclusive; 3] = - [0xe000..=0xf8ff, 0xf0000..=0xffffd, 0x100000..=0x10fffd]; - -fn is_iprivate(c: char) -> bool { - is_in_one_of_ranges(c as u32, &IPRIVATE_RANGES[..]) -} - -fn is_iquery_not_pct_encoded(c: char) -> bool { - is_iprivate(c) || is_ipchar_not_pct_encoded(c) || matches!(c, '/' | '?') -} - -fn iquery(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(many0(alt(( - take_while(is_iquery_not_pct_encoded), - take_while_pct_encoded, - ))))(input) -} - -fn take_while_ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(many0(alt(( - take_while_ipchar, - take_while_pct_encoded, - tag("/"), - tag("?"), - ))))(input) -} - -fn scheme(input: &str) -> IResult<&str, &str, 
CustomError<&str>> { - take_while(is_scheme)(input) -} - -fn is_alphanum_or_hyphen_minus(char: char) -> bool { - match char { - '-' => true, - _ => char.is_alphanum(), - } -} - -fn take_while_pct_encoded(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit))))(input) -} - -fn punycode_encode(host: &str) -> String { - host.split('.') - .map(|sub| { - format!( - "xn--{}", - unic_idna_punycode::encode_str(sub) - .unwrap_or_else(|| "[punycode encode failed]".to_owned()) - ) - }) - .collect::>() - .join(".") -} - -fn is_puny(host: &str) -> bool { - for ch in host.chars() { - if !(is_alphanum_or_hyphen_minus(ch) || ch == '.') { - return true; - } - } - false -} - -fn get_puny_code_warning(link: &str, host: &str) -> Option { - if is_puny(host) { - let ascii_hostname = punycode_encode(host); - Some(PunycodeWarning { - original_hostname: host.to_owned(), - ascii_hostname: ascii_hostname.to_owned(), - punycode_encoded_url: link.replacen(host, &ascii_hostname, 1) - }) - } else { - None - } -} - -fn parse_iri(input: &str) -> IResult<&str, Element, CustomError<&str>> { - let input_ = <&str>::clone(&input); - let (input, scheme) = scheme(input)?; - let (input, (ihier, host, is_ipv6_or_future)) = ihier_part(input)?; - let (input, query) = opt(recognize(tuple((char('?'), iquery))))(input)?; - let (input_, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; - let query = query.unwrap_or(""); - let fragment = fragment.unwrap_or(""); - let len = scheme.len() + ihier.len() + query.len() + fragment.len(); - let link = &input_[0..len]; - Ok(( - input, - Element::Link { - destination: LinkDestination { - target: link, - hostname: if host.len() == 0 { None } else { Some(host) }, - punycode: if is_ipv6_or_future { None } else { get_puny_code_warning(link, host) } , - scheme, - }, - }, - )) -} - -fn parse_irelative_ref(input: &str) -> IResult<&str, Element, CustomError<&str>> { - todo!() -} - -pub 
fn parse_link(input: &str) -> IResult<&str, Element, CustomError<&str>> { - /* - match parse_iri(input) { - Ok((input, iri)) => Ok((input, iri)), - Err(..) => parse_irelative_ref(input), - }*/ - parse_iri(input) -} diff --git a/src/parser/parse_from_text/mod.rs b/src/parser/parse_from_text/mod.rs index ed5e7fd..3ddaa2a 100644 --- a/src/parser/parse_from_text/mod.rs +++ b/src/parser/parse_from_text/mod.rs @@ -2,7 +2,7 @@ use super::Element; pub(crate) mod base_parsers; mod desktop_subset; -mod find_range; +pub mod find_range; pub mod hashtag_content_char_ranges; mod link_element; mod markdown_elements; diff --git a/tests/text_to_ast/links.rs b/tests/text_to_ast/links.rs new file mode 100644 index 0000000..38b3656 --- /dev/null +++ b/tests/text_to_ast/links.rs @@ -0,0 +1,26 @@ +use super::*; +use deltachat_message_parser::parser::{parse_link, LinkDestination}; + +#[test] +fn link() { + let test_cases = vec![ + "http://delta.chat", + "http://delta.chat:8080", + "http://localhost", + "http://127.0.0.0", + "https://delta.chat", + "ftp://delta.chat", + "https://delta.chat/en/help", + "https://delta.chat/en/help?hi=5&e=4", + "https://delta.chat?hi=5&e=4", + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "https://delta#section2.0", + "http://delta.chat:8080?hi=5&e=4#section2.0", + "http://delta.chat:8080#section2.0", + "mailto:delta@example.com", + "mailto:delta@example.com?subject=hi&body=hello%20world", + "mailto:foö@ü.chat", + "https://ü.app#help", // TODO add more urls for testing + ]; + +} From 1ee692cff9734422129fcd1f4e0d9a1c47cd7b27 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sat, 16 Mar 2024 16:45:06 +0330 Subject: [PATCH 22/74] refactor nearly complete --- src/parser/link_url.rs | 198 +++++++++--------- src/parser/parse_from_text/link_element.rs | 1 - .../parse_from_text/markdown_elements.rs | 7 +- src/parser/parse_from_text/mod.rs | 1 - src/parser/parse_from_text/text_elements.rs | 7 +- 5 files changed, 106 insertions(+), 108 deletions(-) delete 
mode 100644 src/parser/parse_from_text/link_element.rs diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 8706abb..d7a1464 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -74,19 +74,18 @@ impl LinkDestination<'_> { pub(crate) fn parse_standalone_with_whitelist( input: &str, ) -> IResult<&str, LinkDestination, CustomError<&str>> { - if let Ok((rest, link)) = parse_link(input) { - if link.destination.hostname == "" { - // if it's a generic url like tel:+989164364485 - if !is_allowed_generic_scheme(scheme) { + if let Ok((rest, link_destination)) = parse_link(input) { + if link_destination.hostname.is_none() { + // if it's a generic url like geo:-15.5,41.1 + if !is_allowed_generic_scheme(link_destination.scheme) { Err(nom::Err::Error(CustomError::InvalidLink)) } else { - Ok((rest, link.destination)) + Ok((rest, link_destination)) } } else { Ok(( - Some(link.destination.hostname), - Some(link.destination.punycode), - link.destination.scheme + rest, + link_destination )) } } else { @@ -95,10 +94,10 @@ impl LinkDestination<'_> { } pub fn parse(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { - if let Ok((rest, link_element))) = parse_link(input) { + if let Ok((rest, link_destination)) = parse_link(input) { Ok(( rest, - link_element.destination + link_destination )) } else { Err(nom::Err::Error(CustomError::InvalidLink)) @@ -106,19 +105,6 @@ impl LinkDestination<'_> { } } -#[derive(Debug, PartialEq)] -enum UrlInfo<'a> { - /// wether url is an Common Internet Scheme URL (if it has `://`) - CommonInternetSchemeURL { - has_puny_code_in_host_name: bool, - hostname: &'a str, - ascii_hostname: String, - scheme: &'a str, - }, - GenericUrl { - scheme: &'a str, - }, -} #[derive(Debug, PartialEq, Eq)] pub enum LinkParseError { @@ -477,7 +463,7 @@ fn get_puny_code_warning(link: &str, host: &str) -> Option { } // IRI links per RFC3987 and RFC3986 -fn parse_iri(input: &str) -> IResult<&str, Element, CustomError<&str>> { +fn 
parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let input_ = <&str>::clone(&input); let (input, scheme) = scheme(input)?; let (input, (ihier, host, is_ipv6_or_future)) = ihier_part(input)?; @@ -489,17 +475,15 @@ fn parse_iri(input: &str) -> IResult<&str, Element, CustomError<&str>> { let link = &input_[0..len]; Ok(( input, - Element::Link { - destination: LinkDestination { - target: link, - hostname: if host.len() == 0 { None } else { Some(host) }, - punycode: if is_ipv6_or_future { - None - } else { - get_puny_code_warning(link, host) - }, - scheme, + LinkDestination { + target: link, + hostname: if host.len() == 0 { None } else { Some(host) }, + punycode: if is_ipv6_or_future { + None + } else { + get_puny_code_warning(link, host) }, + scheme, }, )) } @@ -512,22 +496,20 @@ fn parse_irelative_ref(input: &str) -> IResult<&str, Element, CustomError<&str>> // White listed links in this format: scheme:some_char like tel:+989164364485 -fn parse_generic(input: &str) -> IResult<&str, Element, CustomError<&str>> { +fn parse_generic(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (input, scheme) = scheme(input)?; let (input, target) = take_while(is_not_white_space)(input)?; - Ok((input, Element::Link { - destination: LinkDestination { - scheme, - target, - hostname: None, - punycode: None, - } + Ok((input, LinkDestination { + scheme, + target, + hostname: None, + punycode: None, })) } -pub fn parse_link(input: &str) -> IResult<&str, Element, CustomError<&str>> { +pub fn parse_link(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { /* match parse_iri(input) { Ok((input, iri)) => Ok((input, iri)), @@ -544,11 +526,11 @@ pub fn parse_link(input: &str) -> IResult<&str, Element, CustomError<&str>> { #[cfg(test)] mod test { #![allow(clippy::unwrap_used)] - use crate::parser::link_url::{parse_url, punycode_encode, UrlInfo}; + use crate::parser::link_url::{parse_link, punycode_encode, PunycodeWarning, 
LinkDestination}; #[test] fn basic_parsing() { - let test_cases = vec![ + let test_cases_no_puny = vec![ "http://delta.chat", "http://delta.chat:8080", "http://localhost", @@ -567,18 +549,30 @@ mod test { "mailto:delta@example.com", "mailto:delta@example.com?subject=hi&body=hello%20world", "mailto:foö@ü.chat", - "https://ü.app#help", "ftp://test-test", + ]; + + let test_cases_with_puny = vec![ + "https://ü.app#help", "http://münchen.de", ]; - for input in &test_cases { + for input in &test_cases_no_puny { // println!("testing {}", input); - let (rest, (url, _)) = parse_url(input).unwrap(); + let (rest, link_destination) = parse_link(input).unwrap(); + + assert_eq!(input, &link_destination.target); + assert_eq!(rest.len(), 0); + assert!(link_destination.punycode.is_none()); + } + + for input in &test_cases_with_puny { + let (rest, link_destination) = parse_link(input).unwrap(); - assert_eq!(input, &url); + assert!(link_destination.punycode.is_some()); assert_eq!(rest.len(), 0); + assert_eq!(input, &link_destination.target); } } @@ -588,7 +582,7 @@ mod test { for input in &test_cases { // println!("testing {}", input); - assert!(parse_url(input).is_err()); + assert!(parse_link(input).is_err()); } } #[test] @@ -599,55 +593,53 @@ mod test { #[test] fn punycode_detection() { assert_eq!( - parse_url("http://münchen.de").unwrap().1, - ( - "http://münchen.de", - UrlInfo::CommonInternetSchemeURL { - hostname: "münchen.de", - has_puny_code_in_host_name: true, - ascii_hostname: "xn--mnchen-3ya.de".to_owned(), - scheme: "http" - } - ) + parse_link("http://münchen.de").unwrap().1, + LinkDestination { + hostname: Some("münchen.de"), + target: "http://münchen.de", + scheme: "http", + punycode: Some(PunycodeWarning { + original_hostname: "münchen.de".to_owned(), + punycode_encoded_url: "xn--mnchen-3ya.de".to_owned(), + ascii_hostname: "muenchen.de".to_owned(), + }), + } ); assert_eq!( - parse_url("http://muenchen.de").unwrap().1, - ( - "http://muenchen.de", - 
UrlInfo::CommonInternetSchemeURL { - hostname: "muenchen.de", - has_puny_code_in_host_name: false, - ascii_hostname: "muenchen.de".to_owned(), - scheme: "http" - } - ) + parse_link("http://muenchen.de").unwrap().1, + LinkDestination { + hostname: Some("muenchen.de"), + target: "http://muenchen.de", + scheme: "http", + punycode: None, + } ); } #[test] fn common_schemes() { assert_eq!( - parse_url("http://delta.chat").unwrap().1, + parse_link("http://delta.chat").unwrap(), ( - "http://delta.chat", - UrlInfo::CommonInternetSchemeURL { - hostname: "delta.chat", - has_puny_code_in_host_name: false, - ascii_hostname: "delta.chat".to_owned(), - scheme: "http" + "", + LinkDestination { + hostname: Some("delta.chat"), + target: "http://delta.chat", + scheme: "http", + punycode: None, } ) ); assert_eq!( - parse_url("https://delta.chat").unwrap().1, + parse_link("https://far.chickenkiller.com").unwrap(), ( - "https://delta.chat", - UrlInfo::CommonInternetSchemeURL { - hostname: "delta.chat", - has_puny_code_in_host_name: false, - ascii_hostname: "delta.chat".to_owned(), - scheme: "https" + "", + LinkDestination { + hostname: Some("far.chickenkiller.com"), + target: "https://far.chickenkiller.com", + scheme: "https", + punycode: None, } ) ); @@ -655,27 +647,37 @@ mod test { #[test] fn generic_schemes() { assert_eq!( - parse_url("mailto:someone@example.com").unwrap().1, + parse_link("mailto:someone@example.com").unwrap(), ( - "mailto:someone@example.com", - UrlInfo::GenericUrl { scheme: "mailto" } + "", + LinkDestination { + hostname: None, + scheme: "mailto", + punycode: None, + target: "mailto:someone@example.com" + } + ) ); assert_eq!( - parse_url("bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka") + parse_link("bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka") .unwrap() .1, - ( - "bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka", - UrlInfo::GenericUrl { scheme: "bitcoin" } - ) - ); + LinkDestination { + hostname: None, + scheme: "bitcoin", + target: 
"bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka", + punycode: None, + } + ); assert_eq!( - parse_url("geo:37.786971,-122.399677").unwrap().1, - ( - "geo:37.786971,-122.399677", - UrlInfo::GenericUrl { scheme: "geo" } - ) + parse_link("geo:37.786971,-122.399677").unwrap().1, + LinkDestination { + scheme: "geo", + punycode: None, + target: "geo:37.786971,-122.399677", + hostname: None + } ); } } diff --git a/src/parser/parse_from_text/link_element.rs b/src/parser/parse_from_text/link_element.rs deleted file mode 100644 index 8b13789..0000000 --- a/src/parser/parse_from_text/link_element.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index 7123c21..58eaf7d 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -6,8 +6,7 @@ use super::base_parsers::{ use super::text_elements::parse_text_element; use super::Element; use super::{base_parsers::*, parse_all}; -use crate::parser::link_url::LinkDestination; -use crate::parser::parse_from_text::link_element::parse_link; +use crate::parser::link_url::{LinkDestination, parse_link}; ///! 
nom parsers for markdown elements use nom::{ bytes::complete::{is_not, tag, take, take_while}, @@ -111,11 +110,11 @@ pub(crate) fn delimited_link(input: &str) -> IResult<&str, Element, CustomError< return Err(nom::Err::Failure(CustomError::)); } };*/ - let (rest, link) = parse_link(input)?; + let (rest, destination) = parse_link(input)?; if !rest.is_empty() { return Err(nom::Err::Error(CustomError::UnexpectedContent)); } - Ok((input, link)) + Ok((input, Element::Link { destination })) } // [labeled](https://link) diff --git a/src/parser/parse_from_text/mod.rs b/src/parser/parse_from_text/mod.rs index 3ddaa2a..4796b1d 100644 --- a/src/parser/parse_from_text/mod.rs +++ b/src/parser/parse_from_text/mod.rs @@ -4,7 +4,6 @@ pub(crate) mod base_parsers; mod desktop_subset; pub mod find_range; pub mod hashtag_content_char_ranges; -mod link_element; mod markdown_elements; mod text_elements; diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index 27e5b97..49259d5 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -1,10 +1,9 @@ ///! 
nom parsers for text elements -use crate::parser::link_url::LinkDestination; +use crate::parser::link_url::{LinkDestination, parse_link}; use super::base_parsers::CustomError; use super::base_parsers::*; use super::hashtag_content_char_ranges::hashtag_content_char; -use super::link_element::parse_link; use super::Element; use crate::nom::{Offset, Slice}; use nom::bytes::complete::take_while; @@ -280,8 +279,8 @@ pub(crate) fn parse_text_element( Ok((i, elm)) } else if let Ok((i, elm)) = email_address(input) { Ok((i, elm)) - } else if let Ok((i, elm)) = parse_link(input) { - Ok((i, elm)) + } else if let Ok((i, destination)) = parse_link(input) { + Ok((i, Element::Link { destination })) } else if let Ok((i, _)) = linebreak(input) { Ok((i, Element::Linebreak)) } else { From d0e9acefed9a82a1a5429d5b0874559eea56752b Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sat, 16 Mar 2024 17:04:29 +0330 Subject: [PATCH 23/74] reduce number of stacks required by inlining ihier part --- src/parser/link_url.rs | 56 ++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index d7a1464..94715fb 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -192,10 +192,6 @@ fn is_scheme(c: char) -> bool { is_alpha(c) || is_digit(c) || is_scheme(c) } -fn is_ipv4(c: char) -> bool { - is_digit(c) || c == '.' 
-} - fn ipv4(input: &str) -> IResult<&str, &str, CustomError<&str>> { let (input, ipv4_) = recognize(tuple((u8, char('.'), u8, char('.'), u8, char('.'), u8)))(input)?; @@ -316,10 +312,6 @@ fn take_while_ireg(input: &str) -> IResult<&str, &str, CustomError<&str>> { ))(input) } -fn is_userinfo_not_pct_encoded(c: char) -> bool { - is_iunreserved(c) || is_sub_delim(c) -} - fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> /* (iauthority, host, bool) */ { let i = <&str>::clone(&input); @@ -343,29 +335,6 @@ fn is_iuserinfo_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) || c == ':' } -fn ihier_part(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> { - let i = <&str>::clone(&input); - let (input, _double_slash) = tag("//")(input)?; - let (input, (authority, host, is_ipv6_or_future)) = iauthority(input)?; - let (input, path) = opt(alt(( - recognize(tuple(( - char('/'), - opt(tuple(( - take_while_ipchar1, - many0(tuple((char('/'), take_while_ipchar))), - ))), - ))), // ipath-absolute - recognize(tuple(( - take_while_ipchar, - many0(tuple((char('/'), take_while_ipchar))), - ))), // ipath_rootless - )))(input)?; - let path = path.unwrap_or(""); // it's ipath_empty - let len = 2 + authority.len() + path.len(); - // 2 is for double_slash - Ok((input, (&i[0..len], host, is_ipv6_or_future))) -} - fn is_ipchar_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) || matches!(c, ':' | '@') } @@ -466,12 +435,28 @@ fn get_puny_code_warning(link: &str, host: &str) -> Option { fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let input_ = <&str>::clone(&input); let (input, scheme) = scheme(input)?; - let (input, (ihier, host, is_ipv6_or_future)) = ihier_part(input)?; + let (input, _double_slash) = tag("//")(input)?; + let (input, (authority, host, is_ipv6_or_future)) = iauthority(input)?; + let (input, path) = opt(alt(( + recognize(tuple(( + char('/'), + 
opt(tuple(( + take_while_ipchar1, + many0(tuple((char('/'), take_while_ipchar))), + ))), + ))), // ipath-absolute + recognize(tuple(( + take_while_ipchar, + many0(tuple((char('/'), take_while_ipchar))), + ))), // ipath-rootless + )))(input)?; + let path = path.unwrap_or(""); // it's ipath-empty let (input, query) = opt(recognize(tuple((char('?'), iquery))))(input)?; let (input_, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; let query = query.unwrap_or(""); let fragment = fragment.unwrap_or(""); - let len = scheme.len() + ihier.len() + query.len() + fragment.len(); + let ihier_len = 2 + authority.len() + host.len() + path.len(); + let len = scheme.len() + ihier_len + query.len() + fragment.len(); let link = &input_[0..len]; Ok(( input, @@ -488,16 +473,17 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { )) } - +/* // For future fn parse_irelative_ref(input: &str) -> IResult<&str, Element, CustomError<&str>> { todo!() } - +*/ // White listed links in this format: scheme:some_char like tel:+989164364485 fn parse_generic(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (input, scheme) = scheme(input)?; + let (input, target) = take_while(is_not_white_space)(input)?; Ok((input, LinkDestination { From 9d05d8ac4980d9a2a8faa10af289cbe99e57805d Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sat, 16 Mar 2024 17:20:50 +0330 Subject: [PATCH 24/74] update --- src/parser/link_url.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 94715fb..944a4f5 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -483,7 +483,9 @@ fn parse_irelative_ref(input: &str) -> IResult<&str, Element, CustomError<&str>> // White listed links in this format: scheme:some_char like tel:+989164364485 fn parse_generic(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (input, scheme) = 
scheme(input)?; - + if !is_allowed_generic_scheme(scheme) { + return Err(nom::Err::Error(CustomError::InvalidLink)); + } let (input, target) = take_while(is_not_white_space)(input)?; Ok((input, LinkDestination { @@ -544,7 +546,7 @@ mod test { ]; for input in &test_cases_no_puny { - // println!("testing {}", input); + println!("testing {input}"); let (rest, link_destination) = parse_link(input).unwrap(); @@ -554,6 +556,7 @@ mod test { } for input in &test_cases_with_puny { + println!("testing {input}"); let (rest, link_destination) = parse_link(input).unwrap(); assert!(link_destination.punycode.is_some()); @@ -567,7 +570,7 @@ mod test { let test_cases = vec![";?:/hi", "##://thing"]; for input in &test_cases { - // println!("testing {}", input); + println!("testing {input}"); assert!(parse_link(input).is_err()); } } From 54a9bdef8421c3f2d73bf690a8b5e4566614b1b3 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 17 Mar 2024 16:31:55 +0330 Subject: [PATCH 25/74] fix little bug --- src/parser/link_url.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 944a4f5..2352ca2 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -385,7 +385,7 @@ fn scheme(input: &str) -> IResult<&str, &str, CustomError<&str>> { } fn take_while_pct_encoded(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit))))(input) + recognize(many0(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit)))))(input) } fn punycode_encode(host: &str) -> String { From d024101074c5e70bcadc89b0df4e97773fe5ccbf Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 17 Mar 2024 18:09:42 +0330 Subject: [PATCH 26/74] fix loop... 
--- src/parser/link_url.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 2352ca2..cd1f99f 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -189,7 +189,11 @@ fn is_sub_delim(c: char) -> bool { // Here again, order is important. As URLs/IRIs have letters in them // most of the time and less digits or other characters. --Farooq fn is_scheme(c: char) -> bool { - is_alpha(c) || is_digit(c) || is_scheme(c) + is_alpha(c) || is_digit(c) || is_other_scheme(c) +} + +fn is_other_scheme(c: char) -> bool { + matches!(c, '+' | '-' | '.') } fn ipv4(input: &str) -> IResult<&str, &str, CustomError<&str>> { @@ -381,7 +385,7 @@ fn take_while_ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { } fn scheme(input: &str) -> IResult<&str, &str, CustomError<&str>> { - take_while(is_scheme)(input) + recognize(tuple((take_while_m_n(1, 1, is_alpha), take_while(is_scheme)))(input) } fn take_while_pct_encoded(input: &str) -> IResult<&str, &str, CustomError<&str>> { From ffe9d6ee378ac6d3f4316fcdebfe37c11edca0f0 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 2 Apr 2024 15:40:02 +0330 Subject: [PATCH 27/74] remove not_link_part_char fn --- src/parser/parse_from_text/text_elements.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index 49259d5..0bc4838 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -95,11 +95,11 @@ pub(crate) fn email_address(input: &str) -> IResult<&str, Element, CustomError<& } } +/* fn not_link_part_char(c: char) -> bool { !matches!(c, ':' | '\n' | '\r' | '\t' | ' ') } -/* fn link(input: &str) -> IResult<&str, (), CustomError<&str>> { let (input, _) = take_while1(link_scheme)(input)?; } From e36f1de4075a679f4f9c15bc18e4df33677783a8 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh 
Date: Tue, 2 Apr 2024 16:35:06 +0330 Subject: [PATCH 28/74] fix clippy issues --- src/parser/link_url.rs | 106 ++++++++++--------- src/parser/parse_from_text/desktop_subset.rs | 5 +- src/parser/parse_from_text/find_range.rs | 3 +- src/parser/parse_from_text/text_elements.rs | 3 +- 4 files changed, 58 insertions(+), 59 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index cd1f99f..0e4dfc5 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -8,10 +8,9 @@ use nom::{ error::{ErrorKind, ParseError}, multi::{count, many0, many1, many_m_n}, sequence::{tuple, delimited}, - AsChar, IResult, + IResult, }; -use super::Element; use super::parse_from_text::{ base_parsers::{is_not_white_space, CustomError}, find_range::is_in_one_of_ranges, @@ -71,6 +70,7 @@ fn is_allowed_generic_scheme(scheme: &str) -> bool { impl LinkDestination<'_> { /// parse a link that is not in a delimited link or a labled link, just a part of normal text /// it has a whitelist of schemes, because otherwise + /* pub(crate) fn parse_standalone_with_whitelist( input: &str, ) -> IResult<&str, LinkDestination, CustomError<&str>> { @@ -92,7 +92,7 @@ impl LinkDestination<'_> { Err(nom::Err::Error(CustomError::InvalidLink)) } } - +*/ pub fn parse(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { if let Ok((rest, link_destination)) = parse_link(input) { Ok(( @@ -122,14 +122,6 @@ impl ParseError for LinkParseError { } } -fn is_alphanum_or_hyphen_minus(char: char) -> bool { - match char { - '-' => true, - _ => char.is_alphanum(), - } -} - - fn is_alpha(c: char) -> bool { c.is_alphabetic() } @@ -139,7 +131,7 @@ fn is_hex_digit(c: char) -> bool { } fn is_digit(c: char) -> bool { - c.is_digit(10) + c.is_ascii_digit() } // These ranges have been extracted from RFC3987, Page 8. @@ -176,7 +168,7 @@ fn is_iunreserved(c: char) -> bool { } fn is_other_unreserved(c: char) -> bool { - matches!(c, '_' | '.' | '_' | '~') + matches!(c, '_' | '.' 
| '-' | '~') } fn is_sub_delim(c: char) -> bool { @@ -324,8 +316,12 @@ fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str let (input, port) = opt(recognize(tuple((char(':'), take_while(is_digit)))))(input)?; let userinfo = userinfo.unwrap_or(""); let port = port.unwrap_or(""); - let len = userinfo.len() + host.len() + port.len(); - Ok((input, (&i[0..len], host, is_ipv6_or_future))) + let len = userinfo.len().saturating_add(host.len()).saturating_add(port.len()); + if let Some(out) = i.get(0..len) { + Ok((input, (out, host, is_ipv6_or_future))) + } else { + Err(nom::Err::Failure(CustomError::NoContent)) + } } fn take_while_iuserinfo(input: &str) -> IResult<&str, &str, CustomError<&str>> { @@ -385,7 +381,16 @@ fn take_while_ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { } fn scheme(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(tuple((take_while_m_n(1, 1, is_alpha), take_while(is_scheme)))(input) + let i = <&str>::clone(&input); + let (input, _first) = take_while_m_n(1, 1, is_alpha)(input)?; + let (input, second) = take_while(is_scheme)(input)?; + let len = 1usize.saturating_add(second.len()); + // "1" is for the first, its length is always 1 + if let Some(out) = i.get(0..len) { + Ok((input, out)) + } else { + Err(nom::Err::Failure(CustomError::NoContent)) + } } fn take_while_pct_encoded(input: &str) -> IResult<&str, &str, CustomError<&str>> { @@ -395,12 +400,7 @@ fn take_while_pct_encoded(input: &str) -> IResult<&str, &str, CustomError<&str>> fn punycode_encode(host: &str) -> String { host.split('.') .map(|sub| { - let has_non_ascii_char: bool = sub - .chars() - .map(|ch| is_alphanum_or_hyphen_minus(ch)) - .reduce(|acc, e| e && acc) - .unwrap_or(false); - if has_non_ascii_char { + if is_puny(sub) { format!( "xn--{}", unic_idna_punycode::encode_str(sub) @@ -415,7 +415,7 @@ fn punycode_encode(host: &str) -> String { } fn is_puny(host: &str) -> bool { for ch in host.chars() { - if 
!(is_alphanum_or_hyphen_minus(ch) || ch == '.') { + if !(ch.is_ascii_alphanumeric() || ch == '.') { return true; } } @@ -459,22 +459,24 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (input_, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; let query = query.unwrap_or(""); let fragment = fragment.unwrap_or(""); - let ihier_len = 2 + authority.len() + host.len() + path.len(); - let len = scheme.len() + ihier_len + query.len() + fragment.len(); - let link = &input_[0..len]; - Ok(( - input, - LinkDestination { - target: link, - hostname: if host.len() == 0 { None } else { Some(host) }, - punycode: if is_ipv6_or_future { - None - } else { - get_puny_code_warning(link, host) + let ihier_len = 2usize.saturating_add(authority.len()).saturating_add(host.len()).saturating_add(path.len()); + let len = scheme.len().saturating_add(ihier_len).saturating_add(query.len()).saturating_add(fragment.len()); + if let Some(link) = input_.get(0..len) { + return Ok(( + input, + LinkDestination { + target: link, + hostname: if host.is_empty() { None } else { Some(host) }, + punycode: if is_ipv6_or_future { + None + } else { + get_puny_code_warning(link, host) + }, + scheme, }, - scheme, - }, - )) + )); + } + Err(nom::Err::Failure(CustomError::NoContent)) } /* @@ -486,21 +488,24 @@ fn parse_irelative_ref(input: &str) -> IResult<&str, Element, CustomError<&str>> // White listed links in this format: scheme:some_char like tel:+989164364485 fn parse_generic(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { + let i = <&str>::clone(&input); let (input, scheme) = scheme(input)?; if !is_allowed_generic_scheme(scheme) { return Err(nom::Err::Error(CustomError::InvalidLink)); } - let (input, target) = take_while(is_not_white_space)(input)?; - - Ok((input, LinkDestination { - scheme, - target, - hostname: None, - punycode: None, - })) + let (input, rest) = take_while(is_not_white_space)(input)?; + let len = 
scheme.len().saturating_add(rest.len()); + if let Some(target) = i.get(0..len) { + return Ok((input, LinkDestination { + scheme, + target, + hostname: None, + punycode: None, + })); + } + Err(nom::Err::Failure(CustomError::NoContent)) } - pub fn parse_link(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { /* match parse_iri(input) { @@ -550,9 +555,7 @@ mod test { ]; for input in &test_cases_no_puny { - println!("testing {input}"); - - let (rest, link_destination) = parse_link(input).unwrap(); + let (rest, link_destination) = parse_link(input).expect("Test failed: {input}"); assert_eq!(input, &link_destination.target); assert_eq!(rest.len(), 0); @@ -560,8 +563,7 @@ mod test { } for input in &test_cases_with_puny { - println!("testing {input}"); - let (rest, link_destination) = parse_link(input).unwrap(); + let (rest, link_destination) = parse_link(input).expect("Test failed: {input}"); assert!(link_destination.punycode.is_some()); assert_eq!(rest.len(), 0); diff --git a/src/parser/parse_from_text/desktop_subset.rs b/src/parser/parse_from_text/desktop_subset.rs index 438f0db..fe25f38 100644 --- a/src/parser/parse_from_text/desktop_subset.rs +++ b/src/parser/parse_from_text/desktop_subset.rs @@ -1,10 +1,7 @@ //! desktop subset of markdown, becase this way we can already use the punycode detection of this crate //! 
and also we can keep delimited and labled links in desktop -use super::base_parsers::*; -use super::base_parsers::{ - direct_delimited, is_white_space, is_white_space_but_not_linebreak, CustomError, -}; +use super::base_parsers::CustomError; use super::markdown_elements::{delimited_email_address, delimited_link, labeled_link}; use super::text_elements::parse_text_element; use super::Element; diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs index 3090ed5..20e410e 100644 --- a/src/parser/parse_from_text/find_range.rs +++ b/src/parser/parse_from_text/find_range.rs @@ -20,11 +20,12 @@ enum FindRangeResult<'a> { /// - `code` the u32 to look for a range for. /// /// - `ranges` a refernce to a slice of `RangeInclusive` -fn find_range_for_char<'a>(code: u32, ranges: &'a [RangeInclusive]) -> FindRangeResult<'a> { +fn find_range_for_char(code: u32, ranges: &'_ [RangeInclusive]) -> FindRangeResult<'_> { let index = ranges.binary_search_by_key(&code, |range| *range.start()); match index { Ok(_) => FindRangeResult::WasOnRangeStart, Err(index) => match index { + #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)] 0 => FindRangeResult::Range(&ranges[0]), // Since `index` can never be 0, `index - 1` will never overflow. Furthermore, the // maximum value which the binary search function returns is `NUMBER_OF_RANGES`. diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index 0bc4838..50d9b21 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -1,8 +1,7 @@ ///! 
nom parsers for text elements -use crate::parser::link_url::{LinkDestination, parse_link}; +use crate::parser::link_url::parse_link; use super::base_parsers::CustomError; -use super::base_parsers::*; use super::hashtag_content_char_ranges::hashtag_content_char; use super::Element; use crate::nom::{Offset, Slice}; From d6fae27926bf8414792fe2cb7d15b478c6da268d Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 2 Apr 2024 17:01:23 +0330 Subject: [PATCH 29/74] some more fixes --- src/parser/link_url.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 0e4dfc5..3af50f9 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -439,7 +439,7 @@ fn get_puny_code_warning(link: &str, host: &str) -> Option { fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let input_ = <&str>::clone(&input); let (input, scheme) = scheme(input)?; - let (input, _double_slash) = tag("//")(input)?; + let (input, _period_double_slash) = tag("://")(input)?; let (input, (authority, host, is_ipv6_or_future)) = iauthority(input)?; let (input, path) = opt(alt(( recognize(tuple(( @@ -456,10 +456,10 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { )))(input)?; let path = path.unwrap_or(""); // it's ipath-empty let (input, query) = opt(recognize(tuple((char('?'), iquery))))(input)?; - let (input_, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; + let (input, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; let query = query.unwrap_or(""); let fragment = fragment.unwrap_or(""); - let ihier_len = 2usize.saturating_add(authority.len()).saturating_add(host.len()).saturating_add(path.len()); + let ihier_len = 3usize.saturating_add(authority.len()).saturating_add(path.len()); let len = scheme.len().saturating_add(ihier_len).saturating_add(query.len()).saturating_add(fragment.len()); if let 
Some(link) = input_.get(0..len) { return Ok(( From d6b7dbc0777177627584ca54f1f6b8b7569b3875 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 2 Apr 2024 17:55:19 +0330 Subject: [PATCH 30/74] correction and improvements --- src/parser/link_url.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 3af50f9..0c20e43 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -164,7 +164,7 @@ fn is_unreserved(c: char) -> bool { } fn is_iunreserved(c: char) -> bool { - is_ucschar(c) || is_unreserved(c) + is_unreserved(c) || is_ucschar(c) } fn is_other_unreserved(c: char) -> bool { @@ -226,7 +226,7 @@ fn ipv6(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple(( opt(h16), double_period, - many_m_n(4, 4, h16_and_period), + count(h16_and_period, 4), ls32, ))), recognize(tuple(( @@ -278,7 +278,7 @@ fn ipvfuture(input: &str) -> IResult<&str, &str, CustomError<&str>> { } fn ip_literal(input: &str) -> IResult<&str, &str, CustomError<&str>> { - delimited(char('['), alt((ipv6, ipvfuture)), char(']'))(input) + recognize(tuple(((char('['), alt((ipv6, ipvfuture)), char(']')))))(input) } /// Parse host @@ -459,6 +459,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (input, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; let query = query.unwrap_or(""); let fragment = fragment.unwrap_or(""); + println!("SCH: {}, AUTH: {}, P: {}, Q: {}, F: {}", scheme.len(), authority.len(), path.len(), query.len(), fragment.len()); let ihier_len = 3usize.saturating_add(authority.len()).saturating_add(path.len()); let len = scheme.len().saturating_add(ihier_len).saturating_add(query.len()).saturating_add(fragment.len()); if let Some(link) = input_.get(0..len) { @@ -595,8 +596,8 @@ mod test { scheme: "http", punycode: Some(PunycodeWarning { original_hostname: "münchen.de".to_owned(), - punycode_encoded_url: 
"xn--mnchen-3ya.de".to_owned(), - ascii_hostname: "muenchen.de".to_owned(), + ascii_hostname: "xn--mnchen-3ya.de".to_owned(), + punycode_encoded_url: "http://xn--mnchen-3ya.de".to_owned(), }), } ); From df2842df83e326377c4ccef01606b6ecf7fc8265 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Wed, 3 Apr 2024 15:48:52 +0330 Subject: [PATCH 31/74] passing some more tests --- src/parser/link_url.rs | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 0c20e43..f2fff3c 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -2,7 +2,7 @@ use std::ops::RangeInclusive; use nom::{ branch::alt, - bytes::complete::{tag, take_while, take_while_m_n}, + bytes::complete::{tag, take_while, take_while1, take_while_m_n}, character::complete::{char, u8}, combinator::{opt, recognize}, error::{ErrorKind, ParseError}, @@ -302,10 +302,10 @@ fn parse_host(input: &str) -> IResult<&str, (&str, bool), CustomError<&str>> { } fn take_while_ireg(input: &str) -> IResult<&str, &str, CustomError<&str>> { - alt(( - recognize(many0(take_while_pct_encoded)), - take_while(is_ireg_name_not_pct_encoded), - ))(input) + recognize(many0(alt(( + recognize(many1(take_while_pct_encoded)), + take_while1(is_ireg_name_not_pct_encoded), + ))))(input) } fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> /* (iauthority, host, bool) */ @@ -348,7 +348,7 @@ fn take_while_ipchar(input: &str) -> IResult<&str, &str, CustomError<&str>> { fn take_while_ipchar1(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many1(alt(( - take_while(is_ipchar_not_pct_encoded), + take_while1(is_ipchar_not_pct_encoded), take_while_pct_encoded, ))))(input) } @@ -366,15 +366,14 @@ fn is_iquery_not_pct_encoded(c: char) -> bool { fn iquery(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many0(alt(( - take_while(is_iquery_not_pct_encoded), + 
take_while1(is_iquery_not_pct_encoded), take_while_pct_encoded, ))))(input) } fn take_while_ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many0(alt(( - take_while_ipchar, - take_while_pct_encoded, + take_while_ipchar1, tag("/"), tag("?"), ))))(input) @@ -394,7 +393,7 @@ fn scheme(input: &str) -> IResult<&str, &str, CustomError<&str>> { } fn take_while_pct_encoded(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(many0(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit)))))(input) + recognize(many1(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit)))))(input) } fn punycode_encode(host: &str) -> String { @@ -446,7 +445,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { char('/'), opt(tuple(( take_while_ipchar1, - many0(tuple((char('/'), take_while_ipchar))), + many0(tuple((char('/'), take_while_ipchar1))), ))), ))), // ipath-absolute recognize(tuple(( @@ -556,7 +555,7 @@ mod test { ]; for input in &test_cases_no_puny { - let (rest, link_destination) = parse_link(input).expect("Test failed: {input}"); + let (rest, link_destination) = parse_link(input).expect(&format!("Test failed: {input}")); assert_eq!(input, &link_destination.target); assert_eq!(rest.len(), 0); From c08fb66191b1208fd1ec38fd24f3fa5a7466108b Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Wed, 3 Apr 2024 15:52:33 +0330 Subject: [PATCH 32/74] passing some more more tests --- src/parser/link_url.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index f2fff3c..908b9e1 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -414,7 +414,8 @@ fn punycode_encode(host: &str) -> String { } fn is_puny(host: &str) -> bool { for ch in host.chars() { - if !(ch.is_ascii_alphanumeric() || ch == '.') { + if !(ch.is_ascii_alphanumeric() || matches!(ch, '.' | '-')) { + println!("IT IS! 
{host}"); return true; } } From 124764cfdff6ec45bb4b0ce3a2b79e7215af902f Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Thu, 4 Apr 2024 10:36:09 +0330 Subject: [PATCH 33/74] some modifications --- src/parser/link_url.rs | 54 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 908b9e1..ab4ebb6 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -2,6 +2,7 @@ use std::ops::RangeInclusive; use nom::{ branch::alt, + Slice, bytes::complete::{tag, take_while, take_while1, take_while_m_n}, character::complete::{char, u8}, combinator::{opt, recognize}, @@ -278,7 +279,7 @@ fn ipvfuture(input: &str) -> IResult<&str, &str, CustomError<&str>> { } fn ip_literal(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(tuple(((char('['), alt((ipv6, ipvfuture)), char(']')))))(input) + recognize(tuple((char('['), alt((ipv6, ipvfuture)), char(']'))))(input) } /// Parse host @@ -462,7 +463,56 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { println!("SCH: {}, AUTH: {}, P: {}, Q: {}, F: {}", scheme.len(), authority.len(), path.len(), query.len(), fragment.len()); let ihier_len = 3usize.saturating_add(authority.len()).saturating_add(path.len()); let len = scheme.len().saturating_add(ihier_len).saturating_add(query.len()).saturating_add(fragment.len()); - if let Some(link) = input_.get(0..len) { + if let Some(mut link) = input_.get(0..len) { + if link.ends_with([':', ';', '.', ',']) { + link = link.slice(..len-1); + } + type Stack = Vec; + let mut parenthes: Stack = vec![]; // () + let mut curly_bracket: Stack = vec![]; // {} + let mut bracket: Stack = vec![]; // [] + let mut angle: Stack = vec![]; // <> + let mut alternative_offset: Option = None; + for (i, ch) in link.chars().enumerate() { + match ch { + '(' => { + parenthes.push(true); + } + ')' => { + if parenthes.pop().is_none() { + alternative_offset = 
Some(i); + } + } + '[' => { + bracket.push(true); + } + ']' => { + if bracket.pop().is_none() { + alternative_offset = Some(i); + } + } + '{' => { + curly_bracket.push(true); + } + '}' => { + if curly_bracket.pop().is_none() { + alternative_offset = Some(i); + } + } + '<' => { + angle.push(true); + } + '>' => { + if angle.pop().is_none() { + alternative_offset = Some(i); + } + } + _ => {} + } + } + if let Some(offset) = alternative_offset { + link = link.slice(offset..); + } return Ok(( input, LinkDestination { From 33efac2ffe5b41dcd5b1806fb0d7008c46a34484 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Mon, 8 Apr 2024 16:20:35 +0330 Subject: [PATCH 34/74] fix some more tests --- src/parser/link_url.rs | 88 +++++++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index ab4ebb6..29b8d89 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -460,59 +460,85 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (input, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; let query = query.unwrap_or(""); let fragment = fragment.unwrap_or(""); - println!("SCH: {}, AUTH: {}, P: {}, Q: {}, F: {}", scheme.len(), authority.len(), path.len(), query.len(), fragment.len()); let ihier_len = 3usize.saturating_add(authority.len()).saturating_add(path.len()); - let len = scheme.len().saturating_add(ihier_len).saturating_add(query.len()).saturating_add(fragment.len()); - if let Some(mut link) = input_.get(0..len) { + let mut len = scheme.len().saturating_add(ihier_len).saturating_add(query.len()).saturating_add(fragment.len()); + if let Some(link) = input_.get(0..len) { if link.ends_with([':', ';', '.', ',']) { - link = link.slice(..len-1); + len -= 1; } - type Stack = Vec; - let mut parenthes: Stack = vec![]; // () - let mut curly_bracket: Stack = vec![]; // {} - let mut bracket: Stack = vec![]; // [] - 
let mut angle: Stack = vec![]; // <> - let mut alternative_offset: Option = None; + + let mut parenthes = 0usize; // () + let mut curly_bracket = 0usize; // {} + let mut bracket = 0usize; // [] + let mut angle = 0usize; // <> + for (i, ch) in link.chars().enumerate() { match ch { '(' => { - parenthes.push(true); + parenthes = parenthes.saturating_add(1); + if link.slice(i..).find(')').is_none() { + len = i; + break; + } } - ')' => { - if parenthes.pop().is_none() { - alternative_offset = Some(i); + '{' => { + curly_bracket = curly_bracket.saturating_add(1); + if link.slice(i..).find('}').is_none() { + len = i; + break; } } '[' => { - bracket.push(true); + bracket = bracket.saturating_add(1); + if link.slice(i..).find(']').is_none() { + len = i; + break; + } } - ']' => { - if bracket.pop().is_none() { - alternative_offset = Some(i); + '<' => { + angle = angle.saturating_add(1); + if link.slice(i..).find('>').is_none() { + len = i; + break; } } - '{' => { - curly_bracket.push(true); + ')' => { + if parenthes == 0 { + len = i; + break; + } else { + parenthes = parenthes.saturating_sub(1); + } } - '}' => { - if curly_bracket.pop().is_none() { - alternative_offset = Some(i); + ']' => { + if bracket == 0 { + len = i; + break; + } else { + bracket = bracket.saturating_sub(1); } } - '<' => { - angle.push(true); + '}' => { + if curly_bracket == 0 { + len = i; + break; + } else { + curly_bracket = curly_bracket.saturating_sub(1); + } } '>' => { - if angle.pop().is_none() { - alternative_offset = Some(i); + if angle == 0 { + len = i; + break; + } else { + angle = angle.saturating_sub(1); } } - _ => {} + _ => continue, } } - if let Some(offset) = alternative_offset { - link = link.slice(offset..); - } + let link = input_.slice(0..len); + let input = input_.slice(len..); return Ok(( input, LinkDestination { From 2f4aa9aa4595ebc6b867c250da47614d76c33c43 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Mon, 8 Apr 2024 16:39:00 +0330 Subject: [PATCH 35/74] fixed another 
testcase --- src/parser/link_url.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 29b8d89..7529f4b 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -245,7 +245,6 @@ fn ipv6(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple(( opt(tuple((many_m_n(0, 3, h16_and_period), h16))), double_period, - count(h16_and_period, 1), ls32, ))), recognize(tuple(( @@ -413,10 +412,10 @@ fn punycode_encode(host: &str) -> String { .collect::>() .join(".") } + fn is_puny(host: &str) -> bool { for ch in host.chars() { if !(ch.is_ascii_alphanumeric() || matches!(ch, '.' | '-')) { - println!("IT IS! {host}"); return true; } } @@ -447,12 +446,12 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { char('/'), opt(tuple(( take_while_ipchar1, - many0(tuple((char('/'), take_while_ipchar1))), + many0(tuple((char('/'), opt(take_while_ipchar1)))), ))), ))), // ipath-absolute recognize(tuple(( take_while_ipchar, - many0(tuple((char('/'), take_while_ipchar))), + many0(tuple((char('/'), opt(take_while_ipchar1)))), ))), // ipath-rootless )))(input)?; let path = path.unwrap_or(""); // it's ipath-empty @@ -463,6 +462,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let ihier_len = 3usize.saturating_add(authority.len()).saturating_add(path.len()); let mut len = scheme.len().saturating_add(ihier_len).saturating_add(query.len()).saturating_add(fragment.len()); if let Some(link) = input_.get(0..len) { + println!("{link}"); if link.ends_with([':', ';', '.', ',']) { len -= 1; } @@ -539,6 +539,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { } let link = input_.slice(0..len); let input = input_.slice(len..); + println!("{link}, {input}"); return Ok(( input, LinkDestination { From 8fab2c22236b597b74d78e58d06987f560b70a5d Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Mon, 8 
Apr 2024 17:31:33 +0330 Subject: [PATCH 36/74] remove wrong testcases --- tests/text_to_ast/text_only.rs | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/tests/text_to_ast/text_only.rs b/tests/text_to_ast/text_only.rs index ee03e30..e732e7f 100644 --- a/tests/text_to_ast/text_only.rs +++ b/tests/text_to_ast/text_only.rs @@ -547,38 +547,10 @@ fn link_with_different_parenthesis_in_parenthesis() { Text("()(for [example{ "), Link { destination: link_destination_for_testing( - "https://en.wikipedia.org/wiki/Bracket_(disambiguation){[}hi]" + "https://en.wikipedia.org/wiki/Bracket_(disambiguation)" ) }, - Text("])}") - ] - ); -} - -#[test] -fn link_with_backets_in_backets() { - assert_eq!( - parse_only_text("there are links that contain backets [for example https://en.wikipedia.org/wiki/Bracket_[disambiguation]]"), - vec![ - Text("there are links that contain backets [for example "), - Link { - destination: link_destination_for_testing("https://en.wikipedia.org/wiki/Bracket_[disambiguation]") - }, - Text("]") - ] - ); -} - -#[test] -fn link_with_parenthesis_in_parenthesis_curly() { - assert_eq!( - parse_only_text("there are links that contain parenthesis {for example https://en.wikipedia.org/wiki/Bracket_{disambiguation}}"), - vec![ - Text("there are links that contain parenthesis {for example "), - Link { - destination: link_destination_for_testing("https://en.wikipedia.org/wiki/Bracket_{disambiguation}") - }, - Text("}") + Text("{[}hi]])}") ] ); } From 0ae4d6ff6dc75043f5c08232ea0fdf9646f11cf6 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 9 Apr 2024 17:31:37 +0330 Subject: [PATCH 37/74] fixing more testcases --- src/parser/link_url.rs | 37 ++++++++++++++----- .../parse_from_text/markdown_elements.rs | 16 +------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 7529f4b..9f439be 100644 --- a/src/parser/link_url.rs +++ 
b/src/parser/link_url.rs @@ -104,6 +104,19 @@ impl LinkDestination<'_> { Err(nom::Err::Error(CustomError::InvalidLink)) } } + + pub fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { + let (mut remaining, mut link) = Self::parse(input)?; + println!("BEFORE {remaining} {link:?}"); + if let Some(first) = remaining.chars().nth(0) { + if matches!(first, ';' | '.' | ',' | ':') { + let point = link.target.len() + 1; + link.target = input.slice(..point); + remaining = input.slice(point..); + } + } + Ok((remaining, link)) + } } @@ -316,7 +329,7 @@ fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str let (input, port) = opt(recognize(tuple((char(':'), take_while(is_digit)))))(input)?; let userinfo = userinfo.unwrap_or(""); let port = port.unwrap_or(""); - let len = userinfo.len().saturating_add(host.len()).saturating_add(port.len()); + let len = userinfo.len().saturating_add(port.len()); if let Some(out) = i.get(0..len) { Ok((input, (out, host, is_ipv6_or_future))) } else { @@ -440,7 +453,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let input_ = <&str>::clone(&input); let (input, scheme) = scheme(input)?; let (input, _period_double_slash) = tag("://")(input)?; - let (input, (authority, host, is_ipv6_or_future)) = iauthority(input)?; + let (input, (authority, mut host, is_ipv6_or_future)) = iauthority(input)?; let (input, path) = opt(alt(( recognize(tuple(( char('/'), @@ -459,12 +472,14 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (input, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; let query = query.unwrap_or(""); let fragment = fragment.unwrap_or(""); - let ihier_len = 3usize.saturating_add(authority.len()).saturating_add(path.len()); + let ihier_len = 3usize.saturating_add(authority.len()).saturating_add(host.len()).saturating_add(path.len()); let mut len = 
scheme.len().saturating_add(ihier_len).saturating_add(query.len()).saturating_add(fragment.len()); if let Some(link) = input_.get(0..len) { - println!("{link}"); if link.ends_with([':', ';', '.', ',']) { len -= 1; + if path.len() == 0 && query.len() == 0 && fragment.len() == 0 { + host = input_.slice(scheme.len()+3..input_.len()-1); + } } let mut parenthes = 0usize; // () @@ -507,7 +522,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { len = i; break; } else { - parenthes = parenthes.saturating_sub(1); + parenthes -= 1; } } ']' => { @@ -515,7 +530,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { len = i; break; } else { - bracket = bracket.saturating_sub(1); + bracket -= 1; } } '}' => { @@ -523,7 +538,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { len = i; break; } else { - curly_bracket = curly_bracket.saturating_sub(1); + curly_bracket -= 1; } } '>' => { @@ -531,15 +546,17 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { len = i; break; } else { - angle = angle.saturating_sub(1); + angle -= 1; } } _ => continue, } } + + let link = input_.slice(0..len); let input = input_.slice(len..); - println!("{link}, {input}"); + return Ok(( input, LinkDestination { @@ -590,7 +607,7 @@ pub fn parse_link(input: &str) -> IResult<&str, LinkDestination, CustomError<&st Ok((input, iri)) => Ok((input, iri)), Err(..) 
=> parse_irelative_ref(input), }*/ - alt((parse_iri, parse_generic))(input) + alt((parse_generic, parse_iri))(input) } // TODO testcases diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index 58eaf7d..bf49d50 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -97,19 +97,6 @@ pub(crate) fn delimited_link(input: &str) -> IResult<&str, Element, CustomError< if content.is_empty() { return Err(nom::Err::Error(CustomError::NoContent)); } - /* - let (rest, link) = match link(content)?; { - Ok((rest, link)) => (rest, link), - Err(nom::Err::Error(err)) => { - return Err(nom::Err::Error(CustomError::Nom(err.input, err.code))); - }, - Err(nom::Err::Incomplete(err)) => { - return Err(nom::Err::Incomplete(err)); - }, - Err(nom::Err::Failure(err)) => { - return Err(nom::Err::Failure(CustomError::)); - } - };*/ let (rest, destination) = parse_link(input)?; if !rest.is_empty() { return Err(nom::Err::Error(CustomError::UnexpectedContent)); @@ -130,8 +117,7 @@ pub(crate) fn labeled_link(input: &str) -> IResult<&str, Element, CustomError<&s return Err(nom::Err::Error(CustomError::NoContent)); } // check if result is valid link - let (remainder, destination) = LinkDestination::parse(raw_link)?; - + let (remainder, destination) = LinkDestination::parse_labelled(raw_link)?; if remainder.is_empty() { Ok((input, Element::LabeledLink { label, destination })) } else { From b49797e9d027c3138749be53f7d2ac876fce2bbc Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Mon, 22 Apr 2024 16:14:41 +0330 Subject: [PATCH 38/74] trying to pass last testcases, 2 left --- src/parser/link_url.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index 9f439be..c7b80a4 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -110,9 +110,11 @@ impl LinkDestination<'_> { println!("BEFORE {remaining} {link:?}"); if 
let Some(first) = remaining.chars().nth(0) { if matches!(first, ';' | '.' | ',' | ':') { + println!("Matches!"); let point = link.target.len() + 1; link.target = input.slice(..point); remaining = input.slice(point..); + println!("{link:?} ======= {remaining}"); } } Ok((remaining, link)) From d4ab53ac0b4a144d3975c579d7ffe1204d4daa24 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Mon, 22 Apr 2024 16:39:18 +0330 Subject: [PATCH 39/74] fixed clippy issues --- src/parser/link_url.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/parser/link_url.rs b/src/parser/link_url.rs index c7b80a4..9c4c05c 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url.rs @@ -8,7 +8,7 @@ use nom::{ combinator::{opt, recognize}, error::{ErrorKind, ParseError}, multi::{count, many0, many1, many_m_n}, - sequence::{tuple, delimited}, + sequence::tuple, IResult, }; @@ -107,16 +107,15 @@ impl LinkDestination<'_> { pub fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (mut remaining, mut link) = Self::parse(input)?; - println!("BEFORE {remaining} {link:?}"); - if let Some(first) = remaining.chars().nth(0) { + if let Some(first) = remaining.chars().next() { if matches!(first, ';' | '.' 
| ',' | ':') { - println!("Matches!"); + #[allow(clippy::integer_arithmetic)] let point = link.target.len() + 1; link.target = input.slice(..point); remaining = input.slice(point..); - println!("{link:?} ======= {remaining}"); } } + println!("BEFORE {remaining} {link:?}"); Ok((remaining, link)) } } @@ -125,7 +124,6 @@ impl LinkDestination<'_> { #[derive(Debug, PartialEq, Eq)] pub enum LinkParseError { Nom(I, ErrorKind), - ThisIsNotPercentEncoding, } impl ParseError for LinkParseError { @@ -451,6 +449,7 @@ fn get_puny_code_warning(link: &str, host: &str) -> Option { } // IRI links per RFC3987 and RFC3986 +#[allow(clippy::integer_arithmetic)] fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let input_ = <&str>::clone(&input); let (input, scheme) = scheme(input)?; @@ -471,7 +470,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { )))(input)?; let path = path.unwrap_or(""); // it's ipath-empty let (input, query) = opt(recognize(tuple((char('?'), iquery))))(input)?; - let (input, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; + let (_, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; let query = query.unwrap_or(""); let fragment = fragment.unwrap_or(""); let ihier_len = 3usize.saturating_add(authority.len()).saturating_add(host.len()).saturating_add(path.len()); @@ -479,7 +478,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { if let Some(link) = input_.get(0..len) { if link.ends_with([':', ';', '.', ',']) { len -= 1; - if path.len() == 0 && query.len() == 0 && fragment.len() == 0 { + if path.is_empty() && query.is_empty() && fragment.is_empty() { host = input_.slice(scheme.len()+3..input_.len()-1); } } From 18677d2fc60431eec91c85d1116d828612b8b816 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Thu, 25 Apr 2024 21:51:26 +0330 Subject: [PATCH 40/74] overall enhancements. 
more tbd --- src/parser/link_url/ipv4.rs | 13 ++ src/parser/link_url/ipv6.rs | 93 +++++++++ src/parser/{ => link_url}/link_url.rs | 252 +++++------------------ src/parser/link_url/mod.rs | 103 +++++++++ src/parser/mod.rs | 4 +- src/parser/parse_from_text/find_range.rs | 50 ----- src/parser/utils.rs | 65 ++++++ 7 files changed, 331 insertions(+), 249 deletions(-) create mode 100644 src/parser/link_url/ipv4.rs create mode 100644 src/parser/link_url/ipv6.rs rename src/parser/{ => link_url}/link_url.rs (74%) create mode 100644 src/parser/link_url/mod.rs create mode 100644 src/parser/utils.rs diff --git a/src/parser/link_url/ipv4.rs b/src/parser/link_url/ipv4.rs new file mode 100644 index 0000000..b15374f --- /dev/null +++ b/src/parser/link_url/ipv4.rs @@ -0,0 +1,13 @@ +use nom::{ + character::complete::u8, + combinator::recognize, + sequence::tuple, + IResult, +}; + + +pub fn ipv4(input: &str) -> IResult<&str, &str, CustomError<&str>> { + let (input, ipv4_) = + recognize(tuple((u8, char('.'), u8, char('.'), u8, char('.'), u8)))(input)?; + Ok((input, ipv4_)) +} diff --git a/src/parser/link_url/ipv6.rs b/src/parser/link_url/ipv6.rs new file mode 100644 index 0000000..047bf7e --- /dev/null +++ b/src/parser/link_url/ipv6.rs @@ -0,0 +1,93 @@ +use nom::{ + branch::alt, + Slice, + bytes::complete::{tag, take_while, take_while1, take_while_m_n}, + character::complete::{char, u8}, + combinator::{opt, recognize}, + error::{ErrorKind, ParseError}, + multi::{count, many0, many1, many_m_n}, + sequence::tuple, + IResult, +}; + +use super::ipv4::ipv4; + +// consume 1 to 4 hex digit(s) +// TODO These 4 functions should be macros instead +fn h16(input: &str) -> IResult<&str, &str, CustomError<&str>> { + take_while_m_n(1, 4, is_hex_digit)(input) +} + +// consume or an ipv4 +fn ls32(input: &str) -> IResult<&str, &str, CustomError<&str>> { + let result = recognize(tuple((h16, char(':'), h16)))(input); + if result.is_err() { + ipv4(input) + } else { + result + } +} + +// consume +fn 
h16_and_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { + recognize(tuple((h16, char(':'))))(input) +} + + +fn double_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { + tag("::")(input) +} + +fn ipv6(input: &str) -> IResult<&str, &str, CustomError<&str>> { + // an IPv6 is one of these: + alt(( + // <6 h16_and_period> + recognize(tuple((count(h16_and_period, 6), ls32))), + // :: <5 h16_and_period> + recognize(tuple((double_period, many_m_n(5, 5, h16_and_period), ls32))), + // [h16] :: <4 h16_and_period> + recognize(tuple(( + opt(h16), + double_period, + count(h16_and_period, 4), + ls32, + ))), + // [h16_and_period] :: <3*h16_and_period> + recognize(tuple(( + opt(tuple((many_m_n(0, 1, h16_and_period),))), + double_period, + count(h16_and_period, 3), + ls32, + ))), + // [<0 to 2 h16_and_period> ] :: <2*h16_and_period> + recognize(tuple(( + opt(tuple((many_m_n(0, 2, h16_and_period), h16))), + double_period, + count(h16_and_period, 2), + ls32, + ))), + // [<0 to 3 h16_and_period>] :: + recognize(tuple(( + opt(tuple((many_m_n(0, 3, h16_and_period), h16))), + double_period, + ls32, + ))), + // [<0 to 4 h16_and_period>] :: + recognize(tuple(( + opt(tuple((many_m_n(0, 4, h16_and_period), h16))), + double_period, + ls32, + ))), + // [<0 to 5 h16_and_period>] :: + recognize(tuple(( + opt(tuple((many_m_n(0, 5, h16_and_period), h16))), + double_period, + h16, + ))), + // [<0 to 6 h16_and_period>] :: + recognize(tuple(( + opt(tuple((many_m_n(0, 6, h16_and_period), h16))), + double_period, + ))), + ))(input) +} diff --git a/src/parser/link_url.rs b/src/parser/link_url/link_url.rs similarity index 74% rename from src/parser/link_url.rs rename to src/parser/link_url/link_url.rs index 9c4c05c..0d13f01 100644 --- a/src/parser/link_url.rs +++ b/src/parser/link_url/link_url.rs @@ -12,42 +12,15 @@ use nom::{ IResult, }; -use super::parse_from_text::{ - base_parsers::{is_not_white_space, CustomError}, - find_range::is_in_one_of_ranges, +use crate::parser::{ 
+ parse_from_text::{ + base_parsers::{is_not_white_space, CustomError}, + find_range::is_in_one_of_ranges, + }, + utils::{is_alpha, is_hex_digit, is_digit, find_range}, }; -// Link syntax here is according to RFC 3986 & 3987 --Farooq - -///! Parsing / Validation of URLs -/// -/// - hyperlinks (:// scheme) -/// - whitelisted scheme (: scheme) -/// -/// for hyperlinks it also checks whether the domain contains punycode - -// There are two kinds of Urls -// - Common Internet Scheme https://datatracker.ietf.org/doc/html/rfc1738#section-3.1 -// - Every other url (like mailto) - -#[derive(Debug, PartialEq, Eq, Serialize)] -pub struct LinkDestination<'a> { - pub target: &'a str, - /// hostname if it was found - pub hostname: Option<&'a str>, - /// contains data for the punycode warning if punycode was detected - /// (the host part contains non ascii unicode characters) - pub punycode: Option, - /// scheme - pub scheme: &'a str, -} - -#[derive(Debug, PartialEq, Eq, Serialize)] -pub struct PunycodeWarning { - pub original_hostname: String, - pub ascii_hostname: String, - pub punycode_encoded_url: String, -} +use super::{ipv4::ipv4, ipv6::ipv6}; /// determines which generic schemes (without '://') get linkifyed fn is_allowed_generic_scheme(scheme: &str) -> bool { @@ -68,85 +41,6 @@ fn is_allowed_generic_scheme(scheme: &str) -> bool { ) } -impl LinkDestination<'_> { - /// parse a link that is not in a delimited link or a labled link, just a part of normal text - /// it has a whitelist of schemes, because otherwise - /* - pub(crate) fn parse_standalone_with_whitelist( - input: &str, - ) -> IResult<&str, LinkDestination, CustomError<&str>> { - if let Ok((rest, link_destination)) = parse_link(input) { - if link_destination.hostname.is_none() { - // if it's a generic url like geo:-15.5,41.1 - if !is_allowed_generic_scheme(link_destination.scheme) { - Err(nom::Err::Error(CustomError::InvalidLink)) - } else { - Ok((rest, link_destination)) - } - } else { - Ok(( - rest, - 
link_destination - )) - } - } else { - Err(nom::Err::Error(CustomError::InvalidLink)) - } - } -*/ - pub fn parse(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { - if let Ok((rest, link_destination)) = parse_link(input) { - Ok(( - rest, - link_destination - )) - } else { - Err(nom::Err::Error(CustomError::InvalidLink)) - } - } - - pub fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { - let (mut remaining, mut link) = Self::parse(input)?; - if let Some(first) = remaining.chars().next() { - if matches!(first, ';' | '.' | ',' | ':') { - #[allow(clippy::integer_arithmetic)] - let point = link.target.len() + 1; - link.target = input.slice(..point); - remaining = input.slice(point..); - } - } - println!("BEFORE {remaining} {link:?}"); - Ok((remaining, link)) - } -} - - -#[derive(Debug, PartialEq, Eq)] -pub enum LinkParseError { - Nom(I, ErrorKind), -} - -impl ParseError for LinkParseError { - fn from_error_kind(input: I, kind: ErrorKind) -> Self { - LinkParseError::Nom(input, kind) - } - - fn append(_: I, _: ErrorKind, other: Self) -> Self { - other - } -} - -fn is_alpha(c: char) -> bool { - c.is_alphabetic() -} - -fn is_hex_digit(c: char) -> bool { - c.is_ascii_hexdigit() -} - -fn is_digit(c: char) -> bool { - c.is_ascii_digit() -} // These ranges have been extracted from RFC3987, Page 8. 
const UCSCHAR_RANGES: [RangeInclusive; 17] = [ @@ -202,93 +96,11 @@ fn is_other_scheme(c: char) -> bool { matches!(c, '+' | '-' | '.') } -fn ipv4(input: &str) -> IResult<&str, &str, CustomError<&str>> { - let (input, ipv4_) = - recognize(tuple((u8, char('.'), u8, char('.'), u8, char('.'), u8)))(input)?; - Ok((input, ipv4_)) -} - fn is_ireg_name_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) } -fn h16(input: &str) -> IResult<&str, &str, CustomError<&str>> { - take_while_m_n(1, 4, is_hex_digit)(input) -} - -fn ls32(input: &str) -> IResult<&str, &str, CustomError<&str>> { - let result = recognize(tuple((h16, char(':'), h16)))(input); - if result.is_err() { - ipv4(input) - } else { - result - } -} -fn h16_and_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(tuple((h16, char(':'))))(input) -} - -fn double_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { - tag("::")(input) -} - -fn ipv6(input: &str) -> IResult<&str, &str, CustomError<&str>> { - alt(( - recognize(tuple((count(h16_and_period, 6), ls32))), - recognize(tuple((double_period, many_m_n(5, 5, h16_and_period), ls32))), - recognize(tuple(( - opt(h16), - double_period, - count(h16_and_period, 4), - ls32, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 1, h16_and_period),))), - double_period, - count(h16_and_period, 3), - ls32, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 2, h16_and_period), h16))), - double_period, - count(h16_and_period, 2), - ls32, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 3, h16_and_period), h16))), - double_period, - ls32, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 4, h16_and_period), h16))), - double_period, - ls32, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 5, h16_and_period), h16))), - double_period, - h16, - ))), - recognize(tuple(( - opt(tuple((many_m_n(0, 6, h16_and_period), h16))), - double_period, - ))), - ))(input) -} - -fn is_ipvfuture_last(c: char) -> bool { - is_unreserved(c) || 
is_sub_delim(c) || c == ':' -} - -fn ipvfuture(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(tuple(( - char('v'), - take_while_m_n(1, 1, is_hex_digit), - char('.'), - take_while_m_n(1, 1, is_ipvfuture_last), - )))(input) -} fn ip_literal(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple((char('['), alt((ipv6, ipvfuture)), char(']'))))(input) @@ -300,6 +112,13 @@ fn ip_literal(input: &str) -> IResult<&str, &str, CustomError<&str>> { /// /// Parse host. Returns the rest, the host string and a boolean indicating /// if it is IPvFuture or IPv6. +/// +/// A host is either an IP-Literal(IPv6 or vFuture) or an +/// IPv4 or an Ireg name(e.g. far.chickenkiller.com :) +/// +/// # Return value +/// - `(host, true)` if host is IP-Literal +/// - `(host, false)` if it's ipv4 or ireg-name fn parse_host(input: &str) -> IResult<&str, (&str, bool), CustomError<&str>> { match ip_literal(input) { Ok((input, host)) => { @@ -321,6 +140,13 @@ fn take_while_ireg(input: &str) -> IResult<&str, &str, CustomError<&str>> { ))))(input) } + +/// Parse the iauthority block +/// # Description +/// An iauthority is... 
+/// [iuserinfo] [:port] +/// # Return value +/// unconsumed string AND `(iauthority, host, is_ipliteral)` where `ipliteral` is a boolean fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> /* (iauthority, host, bool) */ { let i = <&str>::clone(&input); @@ -337,6 +163,7 @@ fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str } } +/// Consume an iuserinfo fn take_while_iuserinfo(input: &str) -> IResult<&str, &str, CustomError<&str>> { alt(( recognize(many0(take_while_pct_encoded)), @@ -377,6 +204,8 @@ fn is_iquery_not_pct_encoded(c: char) -> bool { is_iprivate(c) || is_ipchar_not_pct_encoded(c) || matches!(c, '/' | '?') } + +/// Consume an iquery block fn iquery(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many0(alt(( take_while1(is_iquery_not_pct_encoded), @@ -392,6 +221,19 @@ fn take_while_ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { ))))(input) } + +/// Consume scheme characters from input +/// +/// # Description +/// This function as it can be seen, consumes exactly an alpha and as many +/// scheme characters as there are. then it gets a slice of input(as cloned to i) +/// +/// # Arguments +/// +/// - `input` the input string +/// +/// # Return value +/// (unconsumed input AND the scheme string in order) OR Error fn scheme(input: &str) -> IResult<&str, &str, CustomError<&str>> { let i = <&str>::clone(&input); let (input, _first) = take_while_m_n(1, 1, is_alpha)(input)?; @@ -405,10 +247,14 @@ fn scheme(input: &str) -> IResult<&str, &str, CustomError<&str>> { } } + +/// Take as many pct encoded blocks as there are. 
a block is %XX where X is a hex digit fn take_while_pct_encoded(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many1(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit)))))(input) } + +/// encode a host to punycode encoded string fn punycode_encode(host: &str) -> String { host.split('.') .map(|sub| { @@ -426,6 +272,8 @@ fn punycode_encode(host: &str) -> String { .join(".") } + +/// Returns true if host string contains non ASCII characters fn is_puny(host: &str) -> bool { for ch in host.chars() { if !(ch.is_ascii_alphanumeric() || matches!(ch, '.' | '-')) { @@ -435,6 +283,7 @@ fn is_puny(host: &str) -> bool { false } +/// Return a PunycodeWarning struct if host need punycode encoding else None fn get_puny_code_warning(link: &str, host: &str) -> Option { if is_puny(host) { let ascii_hostname = punycode_encode(host); @@ -452,9 +301,14 @@ fn get_puny_code_warning(link: &str, host: &str) -> Option { #[allow(clippy::integer_arithmetic)] fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let input_ = <&str>::clone(&input); + // a link is :// [ipath] [iquery] [ifragment] let (input, scheme) = scheme(input)?; + // ^ parse scheme let (input, _period_double_slash) = tag("://")(input)?; + // ^ hey do I need to explain this, too? 
let (input, (authority, mut host, is_ipv6_or_future)) = iauthority(input)?; + // host is actually part of authority but we need it separately + // see iauthority function description for more information let (input, path) = opt(alt(( recognize(tuple(( char('/'), @@ -468,13 +322,17 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { many0(tuple((char('/'), opt(take_while_ipchar1)))), ))), // ipath-rootless )))(input)?; + // ^ parse one of ipath-absolute or ipath-rootless or none + // which in the third case it's down to ipath-empty(see below) let path = path.unwrap_or(""); // it's ipath-empty let (input, query) = opt(recognize(tuple((char('?'), iquery))))(input)?; let (_, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; - let query = query.unwrap_or(""); - let fragment = fragment.unwrap_or(""); + let query = query.unwrap_or(""); // in the case of no iquery + let fragment = fragment.unwrap_or(""); // in the case of no ifragment let ihier_len = 3usize.saturating_add(authority.len()).saturating_add(host.len()).saturating_add(path.len()); + // compute length of authority + host + path let mut len = scheme.len().saturating_add(ihier_len).saturating_add(query.len()).saturating_add(fragment.len()); + // compute length of link which is ihier_len + scheme + query + fragment if let Some(link) = input_.get(0..len) { if link.ends_with([':', ';', '.', ',']) { len -= 1; diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs new file mode 100644 index 0000000..b0a1072 --- /dev/null +++ b/src/parser/link_url/mod.rs @@ -0,0 +1,103 @@ +mod link_url; +mod ipv6; +mod ipv4; + +use super::link_url::parse_link; + + +///! 
Parsing / Validation of URLs +/// +/// - hyperlinks (:// scheme) +/// - whitelisted scheme (: scheme) +/// +/// for hyperlinks it also checks whether the domain contains punycode + +// There are two kinds of Urls +// - Common Internet Scheme[1] +// - Every other url (like mailto) +// [1] RFC1738(Section 3.1), RFC3987, RFC3988 --Farooq + +#[derive(Debug, PartialEq, Eq, Serialize)] +pub struct LinkDestination<'a> { + pub target: &'a str, + /// hostname if it was found + pub hostname: Option<&'a str>, + /// contains data for the punycode warning if punycode was detected + /// (the host part contains non ascii unicode characters) + pub punycode: Option, + /// scheme + pub scheme: &'a str, +} + +#[derive(Debug, PartialEq, Eq, Serialize)] +pub struct PunycodeWarning { + pub original_hostname: String, + pub ascii_hostname: String, + pub punycode_encoded_url: String, +} + +impl LinkDestination<'_> { + /// parse a link that is not in a delimited link or a labled link, just a part of normal text + /// it has a whitelist of schemes, because otherwise + /* + pub(crate) fn parse_standalone_with_whitelist( + input: &str, + ) -> IResult<&str, LinkDestination, CustomError<&str>> { + if let Ok((rest, link_destination)) = parse_link(input) { + if link_destination.hostname.is_none() { + // if it's a generic url like geo:-15.5,41.1 + if !is_allowed_generic_scheme(link_destination.scheme) { + Err(nom::Err::Error(CustomError::InvalidLink)) + } else { + Ok((rest, link_destination)) + } + } else { + Ok(( + rest, + link_destination + )) + } + } else { + Err(nom::Err::Error(CustomError::InvalidLink)) + } + } +*/ + pub fn parse(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { + if let Ok((rest, link_destination)) = parse_link(input) { + Ok(( + rest, + link_destination + )) + } else { + Err(nom::Err::Error(CustomError::InvalidLink)) + } + } + + pub fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { + let (mut remaining, mut link) = 
Self::parse(input)?; + if let Some(first) = remaining.chars().next() { + if matches!(first, ';' | '.' | ',' | ':') { + let point = link.target.len().saturating_add(1); + link.target = input.slice(..point); + remaining = input.slice(point..); + } + } + Ok((remaining, link)) + } +} + + +#[derive(Debug, PartialEq, Eq)] +pub enum LinkParseError { + Nom(I, ErrorKind), +} + +impl ParseError for LinkParseError { + fn from_error_kind(input: I, kind: ErrorKind) -> Self { + LinkParseError::Nom(input, kind) + } + + fn append(_: I, _: ErrorKind, other: Self) -> Self { + other + } +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index dc71d0f..92fe372 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,7 +1,7 @@ // mod email; pub mod parse_from_text; - -mod link_url; +pub mod link_url; +mod utils; pub use link_url::LinkDestination; diff --git a/src/parser/parse_from_text/find_range.rs b/src/parser/parse_from_text/find_range.rs index 20e410e..8b13789 100644 --- a/src/parser/parse_from_text/find_range.rs +++ b/src/parser/parse_from_text/find_range.rs @@ -1,51 +1 @@ -use std::ops::RangeInclusive; -#[derive(Debug, PartialEq, Eq)] -enum FindRangeResult<'a> { - WasOnRangeStart, - Range(&'a RangeInclusive), -} - -/// Find a range which `code` might be in it. -/// -/// # Description -/// This function gets a sorted slice of inclusive u32 ranges, performs -/// binary search on them and returns a FindRangeResult enum telling -/// which range the `code` might be in. It returns `FindRangeResult::WasOnRangeStart` -/// if the code was exactly on start of a range. Or a `FindRangeResult::Range(range)` -/// which indicates `code` is in `range` or in no ranges. -/// -/// # Arguments -/// -/// - `code` the u32 to look for a range for. 
-/// -/// - `ranges` a refernce to a slice of `RangeInclusive` -fn find_range_for_char(code: u32, ranges: &'_ [RangeInclusive]) -> FindRangeResult<'_> { - let index = ranges.binary_search_by_key(&code, |range| *range.start()); - match index { - Ok(_) => FindRangeResult::WasOnRangeStart, - Err(index) => match index { - #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)] - 0 => FindRangeResult::Range(&ranges[0]), - // Since `index` can never be 0, `index - 1` will never overflow. Furthermore, the - // maximum value which the binary search function returns is `NUMBER_OF_RANGES`. - // Therefore, `index - 1` will never panic if we index the array with it. - #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)] - index => FindRangeResult::Range(&ranges[index - 1]), - }, - } -} - -/// Returns true of `c` is one of the `ranges`, false otherwise. -/// -/// # Arguments -/// -/// - `c` A number(u32) -/// -/// - `ranges` A sorted slice of ranges to see if `c` is in anyone of them -pub fn is_in_one_of_ranges(c: u32, ranges: &[RangeInclusive]) -> bool { - match find_range_for_char(c, ranges) { - FindRangeResult::WasOnRangeStart => true, - FindRangeResult::Range(range) => range.contains(&c), - } -} diff --git a/src/parser/utils.rs b/src/parser/utils.rs new file mode 100644 index 0000000..293841d --- /dev/null +++ b/src/parser/utils.rs @@ -0,0 +1,65 @@ +use std::ops::RangeInclusive; + +#[derive(Debug, PartialEq, Eq)] +enum FindRangeResult<'a> { + WasOnRangeStart, + Range(&'a RangeInclusive), +} + +/// Find a range which `code` might be in it. +/// +/// # Description +/// This function gets a sorted slice of inclusive u32 ranges, performs +/// binary search on them and returns a FindRangeResult enum telling +/// which range the `code` might be in. It returns `FindRangeResult::WasOnRangeStart` +/// if the code was exactly on start of a range. Or a `FindRangeResult::Range(range)` +/// which indicates `code` is in `range` or in no ranges. 
+/// +/// # Arguments +/// +/// - `code` the u32 to look for a range for. +/// +/// - `ranges` a refernce to a slice of `RangeInclusive` +fn find_range_for_char(code: u32, ranges: &'_ [RangeInclusive]) -> FindRangeResult<'_> { + let index = ranges.binary_search_by_key(&code, |range| *range.start()); + match index { + Ok(_) => FindRangeResult::WasOnRangeStart, + Err(index) => match index { + #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)] + 0 => FindRangeResult::Range(&ranges[0]), + // Since `index` can never be 0, `index - 1` will never overflow. Furthermore, the + // maximum value which the binary search function returns is `NUMBER_OF_RANGES`. + // Therefore, `index - 1` will never panic if we index the array with it. + #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)] + index => FindRangeResult::Range(&ranges[index - 1]), + }, + } +} + +/// Returns true of `c` is one of the `ranges`, false otherwise. +/// +/// # Arguments +/// +/// - `c` A number(u32) +/// +/// - `ranges` A sorted slice of ranges to see if `c` is in anyone of them +pub fn is_in_one_of_ranges(c: u32, ranges: &[RangeInclusive]) -> bool { + match find_range_for_char(c, ranges) { + FindRangeResult::WasOnRangeStart => true, + FindRangeResult::Range(range) => range.contains(&c), + } +} + + +// TODO: Convert these to macros +pub fn is_alpha(c: char) -> bool { + c.is_alphabetic() +} + +pub fn is_hex_digit(c: char) -> bool { + c.is_ascii_hexdigit() +} + +pub fn is_digit(c: char) -> bool { + c.is_ascii_digit() +} From c6567adcc6c7dd7761622216f70146cf04b10b0f Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Thu, 25 Apr 2024 21:53:40 +0330 Subject: [PATCH 41/74] fixing tests --- tests/text_to_ast/markdown.rs | 113 ++++++++++++++++++++++++++-------- tests/text_to_ast/mod.rs | 38 +++++++++++- 2 files changed, 123 insertions(+), 28 deletions(-) diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index d3dc0e3..f380737 100644 --- 
a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -1,5 +1,6 @@ use super::*; use deltachat_message_parser::parser::parse_markdown_text; +use deltachat_message_parser::parser::LinkDestination; #[test] fn bold_capitalized_command_suggestion() { @@ -493,43 +494,105 @@ fn email_address_example() { #[test] fn link() { - let test_cases = vec![ - "http://delta.chat", - "http://delta.chat:8080", - "http://localhost", - "http://127.0.0.0", - "https://delta.chat", - "ftp://delta.chat", - "https://delta.chat/en/help", - "https://delta.chat/en/help?hi=5&e=4", - "https://delta.chat?hi=5&e=4", - "https://delta.chat/en/help?hi=5&e=4#section2.0", - "https://delta#section2.0", - "http://delta.chat:8080?hi=5&e=4#section2.0", - "http://delta.chat:8080#section2.0", - "mailto:delta@example.com", - "mailto:delta@example.com?subject=hi&body=hello%20world", - "mailto:foö@ü.chat", - "https://ü.app#help", // TODO add more url test cases + let test_cases_no_puny = vec![ + ( + "http://delta.chat", + http_link_no_puny("http://delta.chat", "delta.chat"), + ), + ( + "http://delta.chat:8080", + http_link_no_puny("http://delta.chat:8080", "delta.chat"), + ), + ( + "http://localhost", + http_link_no_puny("http://localhost", "localhost"), + ), + ( + "http://127.0.0.1", + http_link_no_puny("http://127.0.0.1", "127.0.0.1"), + ), + ( + "https://delta.chat", + https_link_no_puny("http://delta.chat", "delta.chat"), + ), + ( + "ftp://delta.chat", + ftp_link_no_puny("ftp://delta.chat", "delta.chat"), + ), + ( + "https://delta.chat/en/help", + https_link_no_puny("https://delta.chat/en/help", "delta.chat"), + ), + ( + "https://delta.chat?hi=5&e=4", + https_link_no_puny("https://delta.chat?hi=5&e=4", "delta.chat"), + ), + ( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + https_link_no_puny("https://delta.chat/en/help?hi=5&e=4#section2.0", "delta.chat"), + ), + ( + "https://delta#section2.0", + https_link_no_puny("https://delta#section2.0", "delta"), + ), + ( + 
"http://delta.chat:8080?hi=5&e=4#section2.0", + http_link_no_puny("http://delta.chat:8080?hi=5&e=4#section2.0", "delta.chat"), + ), + ( + "http://delta.chat:8080#section2.0", + http_link_no_puny("http://delta.chat:8080#section2.0", "delta.chat"), + ), + ( + "mailto:delta@example.com", + mailto_link_no_puny("mailto:delta@example.com", "example.com"), + ), + ( + "mailto:delta@example.com?subject=hi&body=hello%20world", + mailto_link_no_puny("mailto:delta@example.com?subject=hi&body=hello%20world", "example.com"), + ), + ]; + + let test_cases_with_puny = [ + ( + "mailto:foö@ü.chat", + mailto_link_no_puny("mailto:foö@ü.chat", "ü.chat"), + ), + ( + "https://ü.app#help", + https_link_no_puny("https://ü.app#help", "ü.app") + ) ]; - for input in &test_cases { + + for (input, destination) in &test_cases_no_puny { println!("testing {}", input); assert_eq!( parse_markdown_text(input), vec![Link { - destination: link_destination_for_testing(input) + destination: *destination }] ); } - for input in &test_cases { + for (input, destination) in &test_cases_with_puny { println!("testing <{}>", input); + let result = parse_markdown_text(input)[0].destination; assert_eq!( - parse_markdown_text(input), - vec![Link { - destination: link_destination_for_testing(input) - }] + result.target, + destination.target + ); + assert_eq!( + result.scheme, + destination.scheme + ); + assert_eq!( + result.hostname, + destination.hostname, + ); + assert_eq!( + result.punycode.is_some(), + true ); } } diff --git a/tests/text_to_ast/mod.rs b/tests/text_to_ast/mod.rs index af385cb..3a3f3cc 100644 --- a/tests/text_to_ast/mod.rs +++ b/tests/text_to_ast/mod.rs @@ -1,8 +1,40 @@ use deltachat_message_parser::parser::Element::*; -use deltachat_message_parser::parser::LinkDestination; +use deltachat_message_parser::parser::{LinkDestination, PunycodeWarning}; -pub fn link_destination_for_testing(trusted_real_url: &str) -> LinkDestination { - LinkDestination::parse(trusted_real_url).unwrap().1 +fn 
http_link_no_puny(target: &str, hostname: &str) -> LinkDestination { + LinkDestination { + target, + hostname: Some(hostname), + scheme: "http", + punycode: None + } +} + +fn https_link_no_puny(target: &str, hostname: &str) -> LinkDestination { + LinkDestination { + target, + hostname: Some(hostname), + scheme: "http", + punycode: None + } +} + +fn http_link_no_puny(target: &str, hostname: &str) -> LinkDestination { + LinkDestination { + target, + hostname: Some(hostname), + scheme: "ftp", + punycode: None + } +} + +fn mailto_link_no_puny(target: &str, hostname: &str) -> LinkDestination { + LinkDestination { + target, + hostname: Some(hostname), + scheme: "mailto", + punycode: None, + } } mod desktop_set; From 3b9f466b499efe0e1853c71ac38cae6cacf2eaf4 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Thu, 25 Apr 2024 22:00:56 +0330 Subject: [PATCH 42/74] reconstructing tests --- tests/text_to_ast/markdown.rs | 57 +++++++++++++++++++----------- tests/text_to_ast/text_only.rs | 64 +++------------------------------- 2 files changed, 40 insertions(+), 81 deletions(-) diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index f380737..9288c11 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -605,11 +605,10 @@ fn test_link_example() { ), vec![ Text("This is an my site: "), - Link { - destination: link_destination_for_testing( - "https://delta.chat/en/help?hi=5&e=4#section2.0" - ) - }, + http_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat" + ), Linebreak, Text("Visit me there") ] @@ -637,11 +636,10 @@ fn test_delimited_link_example() { ), vec![ Text("This is an my site: "), - Link { - destination: link_destination_for_testing( - "https://delta.chat/en/help?hi=5&e=4#section2.0" - ) - }, + https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat" + ), Linebreak, Text("Visit me there") ] @@ -654,9 +652,10 @@ fn labeled_link() { parse_markdown_text("[a 
link](https://delta.chat/en/help?hi=5&e=4#section2.0)"), vec![LabeledLink { label: vec![Text("a link")], - destination: link_destination_for_testing( - "https://delta.chat/en/help?hi=5&e=4#section2.0" - ) + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat" + ), }] ); assert_eq!( @@ -665,9 +664,10 @@ fn labeled_link() { ), vec![LabeledLink { label: vec![Text("rich content "), Bold(vec![Text("bold")])], - destination: link_destination_for_testing( - "https://delta.chat/en/help?hi=5&e=4#section2.0" - ) + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat" + ), }] ); } @@ -680,7 +680,10 @@ fn labeled_link_example() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: link_destination_for_testing("https://delta.chat/en/help") + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat" + ), }, Text(".") ] @@ -695,7 +698,10 @@ fn labeled_link_can_have_comma_or_dot_at_end() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: link_destination_for_testing("https://delta.chat/en/help.") + destination: https_link_no_puny( + "https://delta.chat/en/help.", + "delta.chat" + ), }, Text(".") ] @@ -706,7 +712,10 @@ fn labeled_link_can_have_comma_or_dot_at_end() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: link_destination_for_testing("https://delta.chat/en/help,") + destination: https_link_no_puny( + "https://delta.chat/en/help,", + "delta.chat" + ), }, Text(".") ] @@ -717,7 +726,10 @@ fn labeled_link_can_have_comma_or_dot_at_end() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: link_destination_for_testing("https://delta.chat/en/help:") + destination: https_link_no_puny( + "https://delta.chat/en/help:", + "delta.chat" + ), }, Text(".") ] @@ -728,7 +740,10 @@ fn 
labeled_link_can_have_comma_or_dot_at_end() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: link_destination_for_testing("https://delta.chat/en/help;") + destination: https_link_no_puny( + "https://delta.chat/en/help;", + "delta.chat" + ), }, Text(".") ] diff --git a/tests/text_to_ast/text_only.rs b/tests/text_to_ast/text_only.rs index e732e7f..5cac7ea 100644 --- a/tests/text_to_ast/text_only.rs +++ b/tests/text_to_ast/text_only.rs @@ -271,63 +271,6 @@ fn email_address_do_not_parse_last_char_if_special() { ); } -#[test] -fn link() { - let test_cases = vec![ - "http://delta.chat", - "http://delta.chat:8080", - "http://localhost", - "http://127.0.0.0", - "https://delta.chat", - "ftp://delta.chat", - "https://delta.chat/en/help", - "https://delta.chat/en/help?hi=5&e=4", - "https://delta.chat?hi=5&e=4", - "https://delta.chat/en/help?hi=5&e=4#section2.0", - "https://delta#section2.0", - "http://delta.chat:8080?hi=5&e=4#section2.0", - "http://delta.chat:8080#section2.0", - "mailto:delta@example.com", - "mailto:delta@example.com?subject=hi&body=hello%20world", - "mailto:foö@ü.chat", - "https://ü.app#help", // TODO add more urls for testing - ]; - - for input in &test_cases { - println!("testing {}", input); - assert_eq!( - parse_only_text(input), - vec![Link { - destination: link_destination_for_testing(input) - }] - ); - } - - for input in &test_cases { - println!("testing <{}>", input); - assert_eq!( - parse_only_text(input), - vec![Link { - destination: link_destination_for_testing(input) - }] - ); - } - - let input = "http://[2001:0db8:85a3:08d3::0370:7344]:8080/"; - let hostname = "[2001:0db8:85a3:08d3::0370:7344]"; - assert_eq!( - parse_only_text(input), - vec![Link { - destination: LinkDestination { - target: input, - hostname: Some(hostname), - punycode: None, - scheme: "http" - } - }] - ); -} - #[test] fn test_link_example() { assert_eq!( @@ -337,8 +280,9 @@ fn test_link_example() { vec![ Text("This is an my site: "), 
Link { - destination: link_destination_for_testing( - "https://delta.chat/en/help?hi=5&e=4#section2.0" + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat", ) }, Linebreak, @@ -352,7 +296,7 @@ fn delimited_email_should_not_work() { assert_ne!( parse_only_text("This is an my site: \nMessage me there"), vec![ - Text("This is an my site: "), + Text("This is an my email: "), EmailAddress("hello@delta.chat"), Linebreak, Text("Message me there") From 0e1d971ef9a3b3c3ecb41ba11cb51b0c41c7a741 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Thu, 25 Apr 2024 22:15:13 +0330 Subject: [PATCH 43/74] I think I've fixed tests --- tests/text_to_ast/text_only.rs | 47 +++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/tests/text_to_ast/text_only.rs b/tests/text_to_ast/text_only.rs index 5cac7ea..a6bdbb2 100644 --- a/tests/text_to_ast/text_only.rs +++ b/tests/text_to_ast/text_only.rs @@ -313,8 +313,9 @@ fn delimited_link_should_not_work() { vec![ Text("This is an my site: "), Link { - destination: link_destination_for_testing( - "https://delta.chat/en/help?hi=5&e=4#section2.0" + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat", ) }, Linebreak, @@ -329,8 +330,9 @@ fn labeled_link_should_not_work() { parse_only_text("[a link](https://delta.chat/en/help?hi=5&e=4#section2.0)"), vec![LabeledLink { label: vec![Text("a link")], - destination: link_destination_for_testing( - "https://delta.chat/en/help?hi=5&e=4#section2.0" + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat", ) }] ); @@ -338,8 +340,9 @@ fn labeled_link_should_not_work() { parse_only_text("[rich content **bold**](https://delta.chat/en/help?hi=5&e=4#section2.0)"), vec![LabeledLink { label: vec![Text("rich content "), Bold(vec![Text("bold")])], - destination: link_destination_for_testing( - 
"https://delta.chat/en/help?hi=5&e=4#section2.0" + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat", ) }] ); @@ -353,7 +356,7 @@ fn labeled_link_example_should_not_work() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: link_destination_for_testing("https://delta.chat/en/help") + destination: https_link_no_puny("https://delta.chat/en/help", "delta.chat") }, Text(".") ] @@ -367,7 +370,7 @@ fn link_do_not_consume_last_comma() { vec![ Text("you can find the details on "), Link { - destination: link_destination_for_testing("https://delta.chat/en/help") + destination: https_link_no_puny("https://delta.chat/en/help", "delta.chat") }, Text(",") ] @@ -381,7 +384,7 @@ fn link_do_not_consume_last_semicolon_or_colon() { vec![ Text("you can find the details on "), Link { - destination: link_destination_for_testing("https://delta.chat/en/help") + destination: https_link_no_puny("https://delta.chat/en/help", "delta.chat") }, Text(";") ] @@ -391,7 +394,7 @@ fn link_do_not_consume_last_semicolon_or_colon() { vec![ Text("you can find the details on "), Link { - destination: link_destination_for_testing("https://delta.chat/en/help") + destination: https_link_no_puny("https://delta.chat/en/help", "delta.chat") }, Text(":") ] @@ -405,7 +408,7 @@ fn link_do_not_consume_last_dot() { vec![ Text("you can find the details on "), Link { - destination: link_destination_for_testing("https://delta.chat/en/help") + destination: https_link_no_puny("https://delta.chat/en/help", "delta.chat") }, Text(".") ] @@ -415,7 +418,7 @@ fn link_do_not_consume_last_dot() { vec![ Text("you can find the details on "), Link { - destination: link_destination_for_testing("https://delta.chat/en/help.txt") + destination: https_link_no_puny("https://delta.chat/en/help.txt", "delta.chat") }, Text(".") ] @@ -429,7 +432,7 @@ fn link_with_file_extention() { vec![ Text("you can find the details on "), Link { - destination: 
link_destination_for_testing("https://delta.chat/en/help.html") + destination: https_link_no_puny("https://delta.chat/en/help.html", "delta.chat") } ] ); @@ -442,7 +445,7 @@ fn parenthesis_in_links() { vec![ Text("links can contain parenthesis, "), Link { - destination: link_destination_for_testing("https://en.wikipedia.org/wiki/Bracket_(disambiguation)") + destination: https_link_no_puny("https://en.wikipedia.org/wiki/Bracket_(disambiguation)", "en.wikipedia.org") }, Text(" is an example of this.") ] @@ -458,8 +461,9 @@ fn link_in_parenthesis() { vec![ Text("for more information see ("), Link { - destination: link_destination_for_testing( - "https://github.com/deltachat/message-parser/issues/12" + destination: https_link_no_puny( + "https://github.com/deltachat/message-parser/issues/12", + "github.com" ) }, Text(")") @@ -474,7 +478,7 @@ fn link_with_parenthesis_in_parenthesis() { vec![ Text("there are links that contain parenthesis (for example "), Link { - destination: link_destination_for_testing("https://en.wikipedia.org/wiki/Bracket_(disambiguation)") + destination: https_link_no_puny("https://en.wikipedia.org/wiki/Bracket_(disambiguation)", "en.wikipedia.org") }, Text(")") ] @@ -490,8 +494,9 @@ fn link_with_different_parenthesis_in_parenthesis() { vec![ Text("()(for [example{ "), Link { - destination: link_destination_for_testing( - "https://en.wikipedia.org/wiki/Bracket_(disambiguation)" + destination: https_link_no_puny( + "https://en.wikipedia.org/wiki/Bracket_(disambiguation)", + "en.wikipedia.org" ) }, Text("{[}hi]])}") @@ -505,7 +510,7 @@ fn link_with_descriptive_parenthesis() { parse_only_text("https://delta.chat/page(this is the link to our site)"), vec![ Link { - destination: link_destination_for_testing("https://delta.chat/page") + destination: https_link_no_puny("https://delta.chat/page", "delta.chat") }, Text("(this is the link to our site)") ] @@ -519,7 +524,7 @@ fn link_in_parenthesis2() { vec![ Text("A great chat app (see "), Link { - 
destination: link_destination_for_testing("https://delta.chat/en/") + destination: https_link_no_puny("https://delta.chat/en/", "delta.chat") }, Text(")") ] From fde44ead508fe7b6be3cc6109334ace680a0d25d Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sat, 27 Apr 2024 19:54:47 +0330 Subject: [PATCH 44/74] major refactoring of iri parsing. more TBD --- src/parser/link_url/ip/ipliteral.rs | 20 +++++ src/parser/link_url/{ => ip}/ipv4.rs | 3 +- src/parser/link_url/{ => ip}/ipv6.rs | 15 ++-- src/parser/link_url/ip/ipvfuture.rs | 22 ++++++ src/parser/link_url/ip/mod.rs | 4 + src/parser/link_url/link_url.rs | 74 ++++++------------- src/parser/link_url/mod.rs | 20 +++-- src/parser/mod.rs | 3 +- src/parser/parse_from_text/base_parsers.rs | 14 +--- .../hashtag_content_char_ranges.rs | 4 +- .../parse_from_text/markdown_elements.rs | 29 +++++--- src/parser/parse_from_text/text_elements.rs | 22 +++--- src/parser/utils.rs | 34 +++++++-- 13 files changed, 157 insertions(+), 107 deletions(-) create mode 100644 src/parser/link_url/ip/ipliteral.rs rename src/parser/link_url/{ => ip}/ipv4.rs (73%) rename src/parser/link_url/{ => ip}/ipv6.rs (90%) create mode 100644 src/parser/link_url/ip/ipvfuture.rs create mode 100644 src/parser/link_url/ip/mod.rs diff --git a/src/parser/link_url/ip/ipliteral.rs b/src/parser/link_url/ip/ipliteral.rs new file mode 100644 index 0000000..1c8e472 --- /dev/null +++ b/src/parser/link_url/ip/ipliteral.rs @@ -0,0 +1,20 @@ +use nom::{ + branch::alt, + character::complete::char, + combinator::recognize, + sequence::tuple, + IResult, +}; + +use crate::parser::{ + parse_from_text::base_parsers::CustomError, + link_url::ip::{ + ipvfuture::ipvfuture, + ipv6::ipv6, + }, +}; + + +pub fn ip_literal(input: &str) -> IResult<&str, &str, CustomError<&str>> { + recognize(tuple((char('['), alt((ipv6, ipvfuture)), char(']'))))(input) +} diff --git a/src/parser/link_url/ipv4.rs b/src/parser/link_url/ip/ipv4.rs similarity index 73% rename from 
src/parser/link_url/ipv4.rs rename to src/parser/link_url/ip/ipv4.rs index b15374f..c082e56 100644 --- a/src/parser/link_url/ipv4.rs +++ b/src/parser/link_url/ip/ipv4.rs @@ -1,10 +1,11 @@ use nom::{ - character::complete::u8, + character::complete::{u8, char}, combinator::recognize, sequence::tuple, IResult, }; +use crate::parser::parse_from_text::base_parsers::CustomError; pub fn ipv4(input: &str) -> IResult<&str, &str, CustomError<&str>> { let (input, ipv4_) = diff --git a/src/parser/link_url/ipv6.rs b/src/parser/link_url/ip/ipv6.rs similarity index 90% rename from src/parser/link_url/ipv6.rs rename to src/parser/link_url/ip/ipv6.rs index 047bf7e..d3f0546 100644 --- a/src/parser/link_url/ipv6.rs +++ b/src/parser/link_url/ip/ipv6.rs @@ -1,15 +1,18 @@ use nom::{ branch::alt, - Slice, - bytes::complete::{tag, take_while, take_while1, take_while_m_n}, - character::complete::{char, u8}, + bytes::complete::{tag, take_while_m_n}, + character::complete::char, combinator::{opt, recognize}, - error::{ErrorKind, ParseError}, - multi::{count, many0, many1, many_m_n}, + multi::{count, many_m_n}, sequence::tuple, IResult, }; +use crate::parser::{ + parse_from_text::base_parsers::CustomError, + utils::is_hex_digit, +}; + use super::ipv4::ipv4; // consume 1 to 4 hex digit(s) @@ -38,7 +41,7 @@ fn double_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { tag("::")(input) } -fn ipv6(input: &str) -> IResult<&str, &str, CustomError<&str>> { +pub fn ipv6(input: &str) -> IResult<&str, &str, CustomError<&str>> { // an IPv6 is one of these: alt(( // <6 h16_and_period> diff --git a/src/parser/link_url/ip/ipvfuture.rs b/src/parser/link_url/ip/ipvfuture.rs new file mode 100644 index 0000000..4ba30b1 --- /dev/null +++ b/src/parser/link_url/ip/ipvfuture.rs @@ -0,0 +1,22 @@ +use nom::{ + bytes::complete::take_while_m_n, + character::complete::char, + combinator::recognize, + sequence::tuple, + IResult, +}; + +use crate::parser::utils::{ + is_hex_digit, + is_sub_delim, + 
is_unreserved, + parse_from_text::base_parsers::CustomError, +}; + +fn is_ipvfuture_last(ch: char) -> bool { + is_sub_delim(ch) || is_unreserved(ch) || ch == ':' +} + +pub fn ipvfuture(input: &str) -> IResult<&str, &str, CustomError<&str>> { + recognize(tuple((char('v'), take_while_m_n(1, 1, is_hex_digit), char('.'), take_while_m_n(1, 1, is_ipvfuture_last))))(input) +} diff --git a/src/parser/link_url/ip/mod.rs b/src/parser/link_url/ip/mod.rs new file mode 100644 index 0000000..b24bb70 --- /dev/null +++ b/src/parser/link_url/ip/mod.rs @@ -0,0 +1,4 @@ +mod ipvfuture; +mod ipv6; +pub(crate) mod ipv4; +pub(crate) mod ipliteral; diff --git a/src/parser/link_url/link_url.rs b/src/parser/link_url/link_url.rs index 0d13f01..f9d0c74 100644 --- a/src/parser/link_url/link_url.rs +++ b/src/parser/link_url/link_url.rs @@ -4,24 +4,23 @@ use nom::{ branch::alt, Slice, bytes::complete::{tag, take_while, take_while1, take_while_m_n}, - character::complete::{char, u8}, + character::complete::char, combinator::{opt, recognize}, - error::{ErrorKind, ParseError}, - multi::{count, many0, many1, many_m_n}, + multi::{many0, many1}, sequence::tuple, IResult, }; use crate::parser::{ - parse_from_text::{ - base_parsers::{is_not_white_space, CustomError}, - find_range::is_in_one_of_ranges, + parse_from_text::base_parsers::CustomError, + link_url::{ + PunycodeWarning, + LinkDestination, + ip::{ipv4, ipliteral}, }, - utils::{is_alpha, is_hex_digit, is_digit, find_range}, + utils::{is_not_white_space, is_alpha, is_hex_digit, is_digit, is_in_one_of_ranges, is_sub_delim, is_unreserved}, }; -use super::{ipv4::ipv4, ipv6::ipv6}; - /// determines which generic schemes (without '://') get linkifyed fn is_allowed_generic_scheme(scheme: &str) -> bool { matches!( @@ -67,24 +66,10 @@ fn is_ucschar(c: char) -> bool { is_in_one_of_ranges(c as u32, &UCSCHAR_RANGES[..]) } -fn is_unreserved(c: char) -> bool { - is_alpha(c) || is_digit(c) || is_other_unreserved(c) -} - fn is_iunreserved(c: char) -> bool { 
is_unreserved(c) || is_ucschar(c) } -fn is_other_unreserved(c: char) -> bool { - matches!(c, '_' | '.' | '-' | '~') -} - -fn is_sub_delim(c: char) -> bool { - matches!( - c, - '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' - ) -} // Here again, order is important. As URLs/IRIs have letters in them // most of the time and less digits or other characters. --Farooq @@ -100,12 +85,6 @@ fn is_ireg_name_not_pct_encoded(c: char) -> bool { is_iunreserved(c) || is_sub_delim(c) } - - -fn ip_literal(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(tuple((char('['), alt((ipv6, ipvfuture)), char(']'))))(input) -} - /// Parse host /// /// # Description @@ -284,7 +263,7 @@ fn is_puny(host: &str) -> bool { } /// Return a PunycodeWarning struct if host need punycode encoding else None -fn get_puny_code_warning(link: &str, host: &str) -> Option { +pub fn get_puny_code_warning(link: &str, host: &str) -> Option { if is_puny(host) { let ascii_hostname = punycode_encode(host); Some(PunycodeWarning { @@ -460,27 +439,18 @@ fn parse_generic(input: &str) -> IResult<&str, LinkDestination, CustomError<&str Err(nom::Err::Failure(CustomError::NoContent)) } -pub fn parse_link(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { - /* - match parse_iri(input) { - Ok((input, iri)) => Ok((input, iri)), - Err(..) 
=> parse_irelative_ref(input), - }*/ +pub(super) fn parse_link(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { alt((parse_generic, parse_iri))(input) } -// TODO testcases - -// ipv6 https://[::1]/ - -// invalid ascii domain (without non ascii char: https://-test-/hi ) #[cfg(test)] mod test { #![allow(clippy::unwrap_used)] - use crate::parser::link_url::{parse_link, punycode_encode, PunycodeWarning, LinkDestination}; + use crate::parser::{LinkDestination, link_url::link_url::{punycode_encode, PunycodeWarning}}; #[test] fn basic_parsing() { + let x: LinkDestination; let test_cases_no_puny = vec![ "http://delta.chat", "http://delta.chat:8080", @@ -509,7 +479,7 @@ mod test { ]; for input in &test_cases_no_puny { - let (rest, link_destination) = parse_link(input).expect(&format!("Test failed: {input}")); + let (rest, link_destination) = LinkDestination::parse(input).expect(&format!("Test failed: {input}")); assert_eq!(input, &link_destination.target); assert_eq!(rest.len(), 0); @@ -517,7 +487,7 @@ mod test { } for input in &test_cases_with_puny { - let (rest, link_destination) = parse_link(input).expect("Test failed: {input}"); + let (rest, link_destination) = LinkDestination::parse(input).expect("Test failed: {input}"); assert!(link_destination.punycode.is_some()); assert_eq!(rest.len(), 0); @@ -531,7 +501,7 @@ mod test { for input in &test_cases { println!("testing {input}"); - assert!(parse_link(input).is_err()); + assert!(LinkDestination::parse(input).is_err()); } } #[test] @@ -542,7 +512,7 @@ mod test { #[test] fn punycode_detection() { assert_eq!( - parse_link("http://münchen.de").unwrap().1, + LinkDestination::parse("http://münchen.de").unwrap().1, LinkDestination { hostname: Some("münchen.de"), target: "http://münchen.de", @@ -556,7 +526,7 @@ mod test { ); assert_eq!( - parse_link("http://muenchen.de").unwrap().1, + LinkDestination::parse("http://muenchen.de").unwrap().1, LinkDestination { hostname: Some("muenchen.de"), target: 
"http://muenchen.de", @@ -569,7 +539,7 @@ mod test { #[test] fn common_schemes() { assert_eq!( - parse_link("http://delta.chat").unwrap(), + LinkDestination::parse("http://delta.chat").unwrap(), ( "", LinkDestination { @@ -581,7 +551,7 @@ mod test { ) ); assert_eq!( - parse_link("https://far.chickenkiller.com").unwrap(), + LinkDestination::parse("https://far.chickenkiller.com").unwrap(), ( "", LinkDestination { @@ -596,7 +566,7 @@ mod test { #[test] fn generic_schemes() { assert_eq!( - parse_link("mailto:someone@example.com").unwrap(), + LinkDestination::parse("mailto:someone@example.com").unwrap(), ( "", LinkDestination { @@ -609,7 +579,7 @@ mod test { ) ); assert_eq!( - parse_link("bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka") + LinkDestination::parse("bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka") .unwrap() .1, LinkDestination { @@ -620,7 +590,7 @@ mod test { } ); assert_eq!( - parse_link("geo:37.786971,-122.399677").unwrap().1, + LinkDestination::parse("geo:37.786971,-122.399677").unwrap().1, LinkDestination { scheme: "geo", punycode: None, diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index b0a1072..85e4bcf 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -1,8 +1,16 @@ mod link_url; -mod ipv6; -mod ipv4; +mod ip; -use super::link_url::parse_link; +use nom::{ + Slice, + IResult, + error::{ParseError, ErrorKind}, +}; + +use crate::parser::{ + parse_from_text::base_parsers::CustomError, + link_url::link_url::parse_link, +}; ///! 
Parsing / Validation of URLs @@ -30,7 +38,7 @@ pub struct LinkDestination<'a> { } #[derive(Debug, PartialEq, Eq, Serialize)] -pub struct PunycodeWarning { +pub(crate) struct PunycodeWarning { pub original_hostname: String, pub ascii_hostname: String, pub punycode_encoded_url: String, @@ -62,7 +70,7 @@ impl LinkDestination<'_> { } } */ - pub fn parse(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { + pub(crate) fn parse(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { if let Ok((rest, link_destination)) = parse_link(input) { Ok(( rest, @@ -73,7 +81,7 @@ impl LinkDestination<'_> { } } - pub fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { + pub(crate) fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (mut remaining, mut link) = Self::parse(input)?; if let Some(first) = remaining.chars().next() { if matches!(first, ';' | '.' | ',' | ':') { diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 92fe372..874479c 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3,7 +3,7 @@ pub mod parse_from_text; pub mod link_url; mod utils; -pub use link_url::LinkDestination; +pub use crate::parser::link_url::LinkDestination; /// The representation of Elements for the Abstract Syntax Tree #[derive(Debug, PartialEq, Eq, Serialize)] @@ -66,5 +66,6 @@ pub fn parse_only_text(input: &str) -> std::vec::Vec { /// parses text and delimited/labled link elements to replicate current desktop elements pub fn parse_desktop_set(input: &str) -> std::vec::Vec { + let x: LinkDestination; parse_from_text::parse_desktop_set(input) } diff --git a/src/parser/parse_from_text/base_parsers.rs b/src/parser/parse_from_text/base_parsers.rs index 8b03b6a..7827a47 100644 --- a/src/parser/parse_from_text/base_parsers.rs +++ b/src/parser/parse_from_text/base_parsers.rs @@ -8,6 +8,8 @@ use nom::{ IResult, }; +use crate::parser::utils::is_white_space; + #[derive(Debug, PartialEq, Eq)] pub enum 
CustomError { NoContent, @@ -57,18 +59,6 @@ impl IntoCustomError for Result { } } -pub(crate) fn is_white_space(c: char) -> bool { - matches!(c, '\n' | '\r' | '\t' | ' ') -} - -pub(crate) fn is_not_white_space(c: char) -> bool { - !is_white_space(c) -} - -pub(crate) fn is_white_space_but_not_linebreak(c: char) -> bool { - matches!(c, '\t' | ' ') -} - /// delimited no whitespace start or end pub(crate) fn direct_delimited<'a>( input: &'a str, diff --git a/src/parser/parse_from_text/hashtag_content_char_ranges.rs b/src/parser/parse_from_text/hashtag_content_char_ranges.rs index a113252..930c105 100644 --- a/src/parser/parse_from_text/hashtag_content_char_ranges.rs +++ b/src/parser/parse_from_text/hashtag_content_char_ranges.rs @@ -1,4 +1,4 @@ -use crate::parser::parse_from_text::find_range::is_in_one_of_ranges; +use crate::parser::utils::is_in_one_of_ranges; use std::ops::RangeInclusive; const NUMBER_OF_RANGES: usize = 850; @@ -882,7 +882,7 @@ pub(crate) fn hashtag_content_char(c: char) -> bool { #[cfg(test)] mod test { - use crate::parser::parse_from_text::find_range::is_in_one_of_ranges; + use crate::parser::utils::is_in_one_of_ranges; use crate::parser::parse_from_text::hashtag_content_char_ranges::hashtag_content_char; use std::ops::RangeInclusive; diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index bf49d50..7cb98c2 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -1,13 +1,3 @@ -use crate::parser::parse_from_text::text_elements::email_address; - -use super::base_parsers::{ - direct_delimited, is_white_space, is_white_space_but_not_linebreak, CustomError, -}; -use super::text_elements::parse_text_element; -use super::Element; -use super::{base_parsers::*, parse_all}; -use crate::parser::link_url::{LinkDestination, parse_link}; -///! 
nom parsers for markdown elements use nom::{ bytes::complete::{is_not, tag, take, take_while}, character::complete::alphanumeric1, @@ -16,6 +6,23 @@ use nom::{ IResult, }; +use crate::parser::{ + link_url::LinkDestination, + parse_from_text::{ + text_elements::{ + email_address, + parse_text_element, + }, + base_parsers::direct_delimited, + Element, + } + utils::{ + is_white_space, + is_white_space_but_not_linebreak, + }, +}; +use super::{base_parsers::*, parse_all}; + fn inline_code(input: &str) -> IResult<&str, &str, CustomError<&str>> { delimited(tag("`"), is_not("`"), tag("`"))(input) } @@ -97,7 +104,7 @@ pub(crate) fn delimited_link(input: &str) -> IResult<&str, Element, CustomError< if content.is_empty() { return Err(nom::Err::Error(CustomError::NoContent)); } - let (rest, destination) = parse_link(input)?; + let (rest, destination) = LinkDestination::parse(input)?; if !rest.is_empty() { return Err(nom::Err::Error(CustomError::UnexpectedContent)); } diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index 50d9b21..9bbf8ec 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -1,23 +1,23 @@ ///! 
nom parsers for text elements -use crate::parser::link_url::parse_link; - -use super::base_parsers::CustomError; -use super::hashtag_content_char_ranges::hashtag_content_char; -use super::Element; -use crate::nom::{Offset, Slice}; -use nom::bytes::complete::take_while; -use nom::character::complete::char; use nom::{ bytes::{ - complete::{tag, take, take_while1}, + complete::{tag, take, take_while1, take_while}, streaming::take_till1, }, + character::complete::char, character, combinator::{peek, recognize, verify}, sequence::tuple, - AsChar, IResult, + AsChar, IResult, Offset, Slice }; + +use crate::parser::link_url::LinkDestination; +use super::base_parsers::CustomError; +use super::hashtag_content_char_ranges::hashtag_content_char; +use super::Element; + + fn linebreak(input: &str) -> IResult<&str, char, CustomError<&str>> { char('\n')(input) } @@ -278,7 +278,7 @@ pub(crate) fn parse_text_element( Ok((i, elm)) } else if let Ok((i, elm)) = email_address(input) { Ok((i, elm)) - } else if let Ok((i, destination)) = parse_link(input) { + } else if let Ok((i, destination)) = LinkDestination::parse(input) { Ok((i, Element::Link { destination })) } else if let Ok((i, _)) = linebreak(input) { Ok((i, Element::Linebreak)) diff --git a/src/parser/utils.rs b/src/parser/utils.rs index 293841d..8713060 100644 --- a/src/parser/utils.rs +++ b/src/parser/utils.rs @@ -50,16 +50,40 @@ pub fn is_in_one_of_ranges(c: u32, ranges: &[RangeInclusive]) -> bool { } } - -// TODO: Convert these to macros -pub fn is_alpha(c: char) -> bool { +// TODO: Convert these(is_alpha, is_hex_digit, is_digit) to macros OR inline +pub(crate) fn is_alpha(c: char) -> bool { c.is_alphabetic() } -pub fn is_hex_digit(c: char) -> bool { +pub(crate) fn is_hex_digit(c: char) -> bool { c.is_ascii_hexdigit() } -pub fn is_digit(c: char) -> bool { +pub(crate) fn is_digit(c: char) -> bool { c.is_ascii_digit() } + +pub(crate) fn is_sub_delim(c: char) -> bool { + matches!( + c, + '!' 
| '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' + ) +} + +pub(crate) fn is_unreserved(c: char) -> bool { + is_alpha(c) || is_digit(c) || matches!(c, '_' | '.' | '-' | '~') +} + +pub(crate) fn is_white_space(c: char) -> bool { + matches!(c, '\n' | '\r' | '\t' | ' ') +} + +pub(crate) fn is_not_white_space(c: char) -> bool { + !is_white_space(c) +} + +pub(crate) fn is_white_space_but_not_linebreak(c: char) -> bool { + matches!(c, '\t' | ' ') +} + + From 78c70b6e42490d8b77965251cdcc4b9f3e145339 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 28 Apr 2024 18:32:54 +0330 Subject: [PATCH 45/74] refactoring, continued --- .../ip/{ipliteral.rs => ip_literal.rs} | 0 src/parser/link_url/ip/ipvfuture.rs | 10 +- src/parser/link_url/ip/mod.rs | 2 +- src/parser/link_url/link_url.rs | 6 +- src/parser/link_url/mod.rs | 2 +- src/parser/mod.rs | 3 +- .../parse_from_text/markdown_elements.rs | 2 +- tests/text_to_ast/desktop_set.rs | 159 ++++++++++++------ tests/text_to_ast/markdown.rs | 68 ++++---- tests/text_to_ast/mod.rs | 14 +- tests/text_to_ast/text_only.rs | 2 +- 11 files changed, 168 insertions(+), 100 deletions(-) rename src/parser/link_url/ip/{ipliteral.rs => ip_literal.rs} (100%) diff --git a/src/parser/link_url/ip/ipliteral.rs b/src/parser/link_url/ip/ip_literal.rs similarity index 100% rename from src/parser/link_url/ip/ipliteral.rs rename to src/parser/link_url/ip/ip_literal.rs diff --git a/src/parser/link_url/ip/ipvfuture.rs b/src/parser/link_url/ip/ipvfuture.rs index 4ba30b1..923e5f6 100644 --- a/src/parser/link_url/ip/ipvfuture.rs +++ b/src/parser/link_url/ip/ipvfuture.rs @@ -6,10 +6,12 @@ use nom::{ IResult, }; -use crate::parser::utils::{ - is_hex_digit, - is_sub_delim, - is_unreserved, +use crate::parser::{ + utils::{ + is_hex_digit, + is_sub_delim, + is_unreserved, + }, parse_from_text::base_parsers::CustomError, }; diff --git a/src/parser/link_url/ip/mod.rs b/src/parser/link_url/ip/mod.rs index b24bb70..75bbe3d 100644 --- 
a/src/parser/link_url/ip/mod.rs +++ b/src/parser/link_url/ip/mod.rs @@ -1,4 +1,4 @@ mod ipvfuture; mod ipv6; pub(crate) mod ipv4; -pub(crate) mod ipliteral; +pub(crate) mod ip_literal; diff --git a/src/parser/link_url/link_url.rs b/src/parser/link_url/link_url.rs index f9d0c74..c10dbe7 100644 --- a/src/parser/link_url/link_url.rs +++ b/src/parser/link_url/link_url.rs @@ -16,7 +16,10 @@ use crate::parser::{ link_url::{ PunycodeWarning, LinkDestination, - ip::{ipv4, ipliteral}, + ip::{ + ipv4::ipv4, + ip_literal::ip_literal, + }, }, utils::{is_not_white_space, is_alpha, is_hex_digit, is_digit, is_in_one_of_ranges, is_sub_delim, is_unreserved}, }; @@ -450,7 +453,6 @@ mod test { #[test] fn basic_parsing() { - let x: LinkDestination; let test_cases_no_puny = vec![ "http://delta.chat", "http://delta.chat:8080", diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index 85e4bcf..2f7ac88 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -38,7 +38,7 @@ pub struct LinkDestination<'a> { } #[derive(Debug, PartialEq, Eq, Serialize)] -pub(crate) struct PunycodeWarning { +pub struct PunycodeWarning { pub original_hostname: String, pub ascii_hostname: String, pub punycode_encoded_url: String, diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 874479c..72ad79a 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3,7 +3,7 @@ pub mod parse_from_text; pub mod link_url; mod utils; -pub use crate::parser::link_url::LinkDestination; +pub use crate::parser::link_url::{LinkDestination, PunycodeWarning}; /// The representation of Elements for the Abstract Syntax Tree #[derive(Debug, PartialEq, Eq, Serialize)] @@ -66,6 +66,5 @@ pub fn parse_only_text(input: &str) -> std::vec::Vec { /// parses text and delimited/labled link elements to replicate current desktop elements pub fn parse_desktop_set(input: &str) -> std::vec::Vec { - let x: LinkDestination; parse_from_text::parse_desktop_set(input) } diff --git 
a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index 7cb98c2..67e2221 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -15,7 +15,7 @@ use crate::parser::{ }, base_parsers::direct_delimited, Element, - } + }, utils::{ is_white_space, is_white_space_but_not_linebreak, diff --git a/tests/text_to_ast/desktop_set.rs b/tests/text_to_ast/desktop_set.rs index 3407f7a..89252aa 100644 --- a/tests/text_to_ast/desktop_set.rs +++ b/tests/text_to_ast/desktop_set.rs @@ -170,62 +170,114 @@ fn email_address_example() { ] ); } - #[test] fn link() { - let test_cases_no_puny_code = vec![ - "http://delta.chat", - "http://delta.chat:8080", - "http://localhost", - "http://127.0.0.0", - "https://delta.chat", - "ftp://delta.chat", - "https://delta.chat/en/help", - "https://delta.chat/en/help?hi=5&e=4", - "https://delta.chat?hi=5&e=4", - "https://delta.chat/en/help?hi=5&e=4#section2.0", - "http://delta.chat:8080?hi=5&e=4#section2.0", - "http://delta.chat:8080#section2.0", - "mailto:delta@example.com", - "mailto:delta@example.com?subject=hi&body=hello%20world", + let test_cases_no_puny = vec![ + ( + "http://delta.chat", + http_link_no_puny("http://delta.chat", "delta.chat"), + ), + ( + "http://delta.chat:8080", + http_link_no_puny("http://delta.chat:8080", "delta.chat"), + ), + ( + "http://localhost", + http_link_no_puny("http://localhost", "localhost"), + ), + ( + "http://127.0.0.1", + http_link_no_puny("http://127.0.0.1", "127.0.0.1"), + ), + ( + "https://delta.chat", + https_link_no_puny("http://delta.chat", "delta.chat"), + ), + ( + "ftp://delta.chat", + ftp_link_no_puny("ftp://delta.chat", "delta.chat"), + ), + ( + "https://delta.chat/en/help", + https_link_no_puny("https://delta.chat/en/help", "delta.chat"), + ), + ( + "https://delta.chat?hi=5&e=4", + https_link_no_puny("https://delta.chat?hi=5&e=4", "delta.chat"), + ), + ( + 
"https://delta.chat/en/help?hi=5&e=4#section2.0", + https_link_no_puny("https://delta.chat/en/help?hi=5&e=4#section2.0", "delta.chat"), + ), + ( + "https://delta#section2.0", + https_link_no_puny("https://delta#section2.0", "delta"), + ), + ( + "http://delta.chat:8080?hi=5&e=4#section2.0", + http_link_no_puny("http://delta.chat:8080?hi=5&e=4#section2.0", "delta.chat"), + ), + ( + "http://delta.chat:8080#section2.0", + http_link_no_puny("http://delta.chat:8080#section2.0", "delta.chat"), + ), + ( + "mailto:delta@example.com", + mailto_link_no_puny("mailto:delta@example.com", "example.com"), + ), + ( + "mailto:delta@example.com?subject=hi&body=hello%20world", + mailto_link_no_puny("mailto:delta@example.com?subject=hi&body=hello%20world", "example.com"), + ), ]; - let test_cases_with_punycode = vec![ - "mailto:foö@ü.chat", - "https://ü.app#help", - "https://delta#section2.0", + + let test_cases_with_puny = [ + ( + "mailto:foö@ü.chat", + mailto_link_no_puny("mailto:foö@ü.chat", "ü.chat"), + ), + ( + "https://ü.app#help", + https_link_no_puny("https://ü.app#help", "ü.app") + ) ]; - for input in &test_cases_no_puny_code { + + for (input, destination) in &test_cases_no_puny { println!("testing {input}"); assert_eq!( parse_desktop_set(input), vec![Link { - destination: link_destination_for_testing(input) + destination: *destination }] ); - let result = parse_desktop_set(input); - assert_eq!(result.len(), 1); - assert!(matches!( - result[0], - Link { - destination: LinkDestination { - target: _, - punycode: None, - hostname: _, - scheme: _, - } - } - )); } - for input in &test_cases_with_punycode { + for (input, expected_destination) in &test_cases_with_puny { println!("testing {input}"); - assert_eq!( - parse_desktop_set(input), - vec![Link { - destination: link_destination_for_testing(input), - }] - ); + match &parse_desktop_set(input)[0] { + Link { destination } => { + assert_eq!( + expected_destination.target, + destination.target + ); + assert_eq!( + 
expected_destination.scheme, + destination.scheme + ); + assert_eq!( + expected_destination.hostname, + destination.hostname, + ); + assert_eq!( + destination.punycode.is_some(), + true + ); + } + _ => { + panic!(); + } + } } } @@ -238,8 +290,9 @@ fn test_link_example() { vec![ Text("This is an my site: "), Link { - destination: link_destination_for_testing( - "https://delta.chat/en/help?hi=5&e=4#section2.0" + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat" ) }, Linebreak, @@ -267,8 +320,9 @@ fn labeled_link_should_not_work() { parse_desktop_set("[a link](https://delta.chat/en/help?hi=5&e=4#section2.0)"), vec![LabeledLink { label: vec![Text("a link")], - destination: link_destination_for_testing( - "https://delta.chat/en/help?hi=5&e=4#section2.0" + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat", ) }] ); @@ -278,8 +332,9 @@ fn labeled_link_should_not_work() { ), vec![LabeledLink { label: vec![Text("rich content "), Bold(vec![Text("bold")])], - destination: link_destination_for_testing( - "https://delta.chat/en/help?hi=5&e=4#section2.0" + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat", ) }] ); @@ -293,7 +348,7 @@ fn labeled_link_example_should_not_work() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: link_destination_for_testing("https://delta.chat/en/help") + destination: https_link_no_puny("https://delta.chat/en/help", "delta.chat") }, Text(".") ] @@ -306,7 +361,7 @@ fn inline_link_do_not_eat_last_char_if_it_is_special() { parse_desktop_set("https://delta.chat,"), vec![ Link { - destination: link_destination_for_testing("https://delta.chat") + destination: https_link_no_puny("https://delta.chat", "delta.chat") }, Text(",") ] @@ -315,7 +370,7 @@ fn inline_link_do_not_eat_last_char_if_it_is_special() { parse_desktop_set("https://delta.chat."), vec![ Link { - 
destination: link_destination_for_testing("https://delta.chat") + destination: https_link_no_puny("https://delta.chat", "delta.chat") }, Text(".") ] @@ -323,7 +378,7 @@ fn inline_link_do_not_eat_last_char_if_it_is_special() { assert_eq!( parse_desktop_set("https://delta.chat/page.hi"), vec![Link { - destination: link_destination_for_testing("https://delta.chat/page.hi") + destination: https_link_no_puny("https://delta.chat/page.hi", "delta.chat") }] ); } diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index 9288c11..57a6762 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -1,6 +1,5 @@ use super::*; use deltachat_message_parser::parser::parse_markdown_text; -use deltachat_message_parser::parser::LinkDestination; #[test] fn bold_capitalized_command_suggestion() { @@ -565,35 +564,46 @@ fn link() { ]; - for (input, destination) in &test_cases_no_puny { - println!("testing {}", input); + for (input, expected_destination) in &test_cases_no_puny { + println!("testing {input}"); + let result = parse_markdown_text(input); assert_eq!( - parse_markdown_text(input), - vec![Link { - destination: *destination - }] + result.len(), + 1 + ); + assert_eq!( + result[0], + Link { + destination: *expected_destination.clone() + } ); } - for (input, destination) in &test_cases_with_puny { + for (input, expected_destination) in &test_cases_with_puny { println!("testing <{}>", input); - let result = parse_markdown_text(input)[0].destination; - assert_eq!( - result.target, - destination.target - ); - assert_eq!( - result.scheme, - destination.scheme - ); - assert_eq!( - result.hostname, - destination.hostname, - ); - assert_eq!( - result.punycode.is_some(), - true - ); + match &parse_markdown_text(input)[0] { + Link { destination } => { + assert_eq!( + expected_destination.target, + destination.target + ); + assert_eq!( + expected_destination.scheme, + destination.scheme + ); + assert_eq!( + expected_destination.hostname, + 
destination.hostname, + ); + assert_eq!( + destination.punycode.is_some(), + true + ); + } + _ => { + panic!(); + } + } } } @@ -605,10 +615,10 @@ fn test_link_example() { ), vec![ Text("This is an my site: "), - http_link_no_puny( + Link { destination: http_link_no_puny( "https://delta.chat/en/help?hi=5&e=4#section2.0", "delta.chat" - ), + )}, Linebreak, Text("Visit me there") ] @@ -636,10 +646,10 @@ fn test_delimited_link_example() { ), vec![ Text("This is an my site: "), - https_link_no_puny( + Link { destination: https_link_no_puny( "https://delta.chat/en/help?hi=5&e=4#section2.0", "delta.chat" - ), + )}, Linebreak, Text("Visit me there") ] diff --git a/tests/text_to_ast/mod.rs b/tests/text_to_ast/mod.rs index 3a3f3cc..8bcb021 100644 --- a/tests/text_to_ast/mod.rs +++ b/tests/text_to_ast/mod.rs @@ -1,7 +1,7 @@ use deltachat_message_parser::parser::Element::*; -use deltachat_message_parser::parser::{LinkDestination, PunycodeWarning}; +use deltachat_message_parser::parser::LinkDestination; -fn http_link_no_puny(target: &str, hostname: &str) -> LinkDestination { +fn http_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination<'a> { LinkDestination { target, hostname: Some(hostname), @@ -10,25 +10,25 @@ fn http_link_no_puny(target: &str, hostname: &str) -> LinkDestination { } } -fn https_link_no_puny(target: &str, hostname: &str) -> LinkDestination { +fn ftp_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination<'a> { LinkDestination { target, hostname: Some(hostname), - scheme: "http", + scheme: "ftp", punycode: None } } -fn http_link_no_puny(target: &str, hostname: &str) -> LinkDestination { +fn https_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination<'a> { LinkDestination { target, hostname: Some(hostname), - scheme: "ftp", + scheme: "http", punycode: None } } -fn mailto_link_no_puny(target: &str, hostname: &str) -> LinkDestination { +fn mailto_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> 
LinkDestination<'a> { LinkDestination { target, hostname: Some(hostname), diff --git a/tests/text_to_ast/text_only.rs b/tests/text_to_ast/text_only.rs index a6bdbb2..0ba85a2 100644 --- a/tests/text_to_ast/text_only.rs +++ b/tests/text_to_ast/text_only.rs @@ -1,5 +1,5 @@ use super::*; -use deltachat_message_parser::parser::{parse_only_text, LinkDestination}; +use deltachat_message_parser::parser::parse_only_text; #[test] fn do_not_parse_markdown_elements() { From 3d77f757eda56fb70b3c9d8c1e6d700210e08c69 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 28 Apr 2024 19:44:07 +0330 Subject: [PATCH 46/74] refactoring complete. now let's pass tests --- src/parser/link_url/mod.rs | 5 +++-- tests/text_to_ast/desktop_set.rs | 2 +- tests/text_to_ast/markdown.rs | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index 2f7ac88..d0ea554 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -25,7 +25,7 @@ use crate::parser::{ // - Every other url (like mailto) // [1] RFC1738(Section 3.1), RFC3987, RFC3988 --Farooq -#[derive(Debug, PartialEq, Eq, Serialize)] +#[derive(Debug, PartialEq, Eq, Serialize, Clone)] pub struct LinkDestination<'a> { pub target: &'a str, /// hostname if it was found @@ -37,13 +37,14 @@ pub struct LinkDestination<'a> { pub scheme: &'a str, } -#[derive(Debug, PartialEq, Eq, Serialize)] +#[derive(Debug, PartialEq, Eq, Serialize, Clone)] pub struct PunycodeWarning { pub original_hostname: String, pub ascii_hostname: String, pub punycode_encoded_url: String, } + impl LinkDestination<'_> { /// parse a link that is not in a delimited link or a labled link, just a part of normal text /// it has a whitelist of schemes, because otherwise diff --git a/tests/text_to_ast/desktop_set.rs b/tests/text_to_ast/desktop_set.rs index 89252aa..8633555 100644 --- a/tests/text_to_ast/desktop_set.rs +++ b/tests/text_to_ast/desktop_set.rs @@ -248,7 +248,7 @@ fn link() { 
assert_eq!( parse_desktop_set(input), vec![Link { - destination: *destination + destination: destination.clone() }] ); } diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index 57a6762..0e46247 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -574,13 +574,13 @@ fn link() { assert_eq!( result[0], Link { - destination: *expected_destination.clone() + destination: expected_destination.clone() } ); } for (input, expected_destination) in &test_cases_with_puny { - println!("testing <{}>", input); + println!("testing {}", input); match &parse_markdown_text(input)[0] { Link { destination } => { assert_eq!( From 1a631f7ae05fe87a61c8e3e4a2c2c0d7fdd9a18c Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 28 Apr 2024 20:39:12 +0330 Subject: [PATCH 47/74] fixed all testcases --- tests/text_to_ast/desktop_set.rs | 14 +++++++------- tests/text_to_ast/markdown.rs | 21 +++++++++++---------- tests/text_to_ast/mod.rs | 6 +++--- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/tests/text_to_ast/desktop_set.rs b/tests/text_to_ast/desktop_set.rs index 8633555..d6bd311 100644 --- a/tests/text_to_ast/desktop_set.rs +++ b/tests/text_to_ast/desktop_set.rs @@ -191,7 +191,7 @@ fn link() { ), ( "https://delta.chat", - https_link_no_puny("http://delta.chat", "delta.chat"), + https_link_no_puny("https://delta.chat", "delta.chat"), ), ( "ftp://delta.chat", @@ -223,19 +223,19 @@ fn link() { ), ( "mailto:delta@example.com", - mailto_link_no_puny("mailto:delta@example.com", "example.com"), + mailto_link_no_puny("mailto:delta@example.com"), ), ( "mailto:delta@example.com?subject=hi&body=hello%20world", - mailto_link_no_puny("mailto:delta@example.com?subject=hi&body=hello%20world", "example.com"), + mailto_link_no_puny("mailto:delta@example.com?subject=hi&body=hello%20world"), ), - ]; - - let test_cases_with_puny = [ ( "mailto:foö@ü.chat", - mailto_link_no_puny("mailto:foö@ü.chat", "ü.chat"), + 
mailto_link_no_puny("mailto:foö@ü.chat"), ), + ]; + + let test_cases_with_puny = [ ( "https://ü.app#help", https_link_no_puny("https://ü.app#help", "ü.app") diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index 0e46247..dd786dc 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -512,7 +512,7 @@ fn link() { ), ( "https://delta.chat", - https_link_no_puny("http://delta.chat", "delta.chat"), + https_link_no_puny("https://delta.chat", "delta.chat"), ), ( "ftp://delta.chat", @@ -544,19 +544,19 @@ fn link() { ), ( "mailto:delta@example.com", - mailto_link_no_puny("mailto:delta@example.com", "example.com"), + mailto_link_no_puny("mailto:delta@example.com"), ), ( "mailto:delta@example.com?subject=hi&body=hello%20world", - mailto_link_no_puny("mailto:delta@example.com?subject=hi&body=hello%20world", "example.com"), + mailto_link_no_puny("mailto:delta@example.com?subject=hi&body=hello%20world"), ), - ]; - - let test_cases_with_puny = [ ( "mailto:foö@ü.chat", - mailto_link_no_puny("mailto:foö@ü.chat", "ü.chat"), + mailto_link_no_puny("mailto:foö@ü.chat"), ), + ]; + + let test_cases_with_puny = [ ( "https://ü.app#help", https_link_no_puny("https://ü.app#help", "ü.app") @@ -615,7 +615,7 @@ fn test_link_example() { ), vec![ Text("This is an my site: "), - Link { destination: http_link_no_puny( + Link { destination: https_link_no_puny( "https://delta.chat/en/help?hi=5&e=4#section2.0", "delta.chat" )}, @@ -645,11 +645,12 @@ fn test_delimited_link_example() { "This is an my site: \nVisit me there" ), vec![ - Text("This is an my site: "), + Text("This is an my site: <"), Link { destination: https_link_no_puny( "https://delta.chat/en/help?hi=5&e=4#section2.0", "delta.chat" )}, + Text(">"), Linebreak, Text("Visit me there") ] @@ -691,7 +692,7 @@ fn labeled_link_example() { LabeledLink { label: vec![Text("here")], destination: https_link_no_puny( - "https://delta.chat/en/help?hi=5&e=4#section2.0", + "https://delta.chat/en/help", 
"delta.chat" ), }, diff --git a/tests/text_to_ast/mod.rs b/tests/text_to_ast/mod.rs index 8bcb021..e38532a 100644 --- a/tests/text_to_ast/mod.rs +++ b/tests/text_to_ast/mod.rs @@ -23,15 +23,15 @@ fn https_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination LinkDestination { target, hostname: Some(hostname), - scheme: "http", + scheme: "https", punycode: None } } -fn mailto_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination<'a> { +fn mailto_link_no_puny<'a>(target: &'a str) -> LinkDestination<'a> { LinkDestination { target, - hostname: Some(hostname), + hostname: None, scheme: "mailto", punycode: None, } From eb79b92089452de7306c6859374fd9351ab64256 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 28 Apr 2024 20:43:28 +0330 Subject: [PATCH 48/74] change link_url to parse_link --- src/parser/link_url/mod.rs | 4 ++-- src/parser/link_url/{link_url.rs => parse_link.rs} | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename src/parser/link_url/{link_url.rs => parse_link.rs} (99%) diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index d0ea554..1bb4aa1 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -1,4 +1,4 @@ -mod link_url; +mod parse_link; mod ip; use nom::{ @@ -9,7 +9,7 @@ use nom::{ use crate::parser::{ parse_from_text::base_parsers::CustomError, - link_url::link_url::parse_link, + link_url::parse_link::parse_link, }; diff --git a/src/parser/link_url/link_url.rs b/src/parser/link_url/parse_link.rs similarity index 99% rename from src/parser/link_url/link_url.rs rename to src/parser/link_url/parse_link.rs index c10dbe7..10b129e 100644 --- a/src/parser/link_url/link_url.rs +++ b/src/parser/link_url/parse_link.rs @@ -449,7 +449,7 @@ pub(super) fn parse_link(input: &str) -> IResult<&str, LinkDestination, CustomEr #[cfg(test)] mod test { #![allow(clippy::unwrap_used)] - use crate::parser::{LinkDestination, link_url::link_url::{punycode_encode, PunycodeWarning}}; + 
use crate::parser::{LinkDestination, link_url::parse_link::{punycode_encode, PunycodeWarning}}; #[test] fn basic_parsing() { From db25cad15285dcfda57ff98cc1341f2cd3d347b7 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 28 Apr 2024 21:10:04 +0330 Subject: [PATCH 49/74] new data for benchmark --- benches/testdata.md | 760 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 760 insertions(+) diff --git a/benches/testdata.md b/benches/testdata.md index ed9b973..eb9773b 100644 --- a/benches/testdata.md +++ b/benches/testdata.md @@ -10,6 +10,766 @@ http://delta.chat?test=1234&y=4 https://delta.chat/hello?test=1234&y=4 + + Then a text containing a delimited link then a [labeled link](https://delta.chat/hello?test=1234&y=4) and a #hashtag, cause why not. `inline code` and more useless text: 1+1 != 3 ; what a user may or may not write in a message somehow. + + +tons of data from awesome bitcoin cash list: + +
+ awesome bitcoin cash +
+
+
+A curated list of Bitcoin Cash projects & resources
+ + awesome + + +
+Bitcoin Cash (BCH) is a project to scale bitcoin on-chain as an electronic peer-to-peer payment system for the world. 🚀 + +
+
+ +📤 [a mobile friendly version](https://awesomebitcoin.cash) of this [project](https://github.com/2qx/awesome-bitcoin-cash) is formatted [from markdown](https://github.com/2qx/awesome-bitcoin-cash/blob/master/README.md) by github pages. + +Pull requests are welcome, please see [the contribution guidelines](CONTRIBUTING.md). +
+ +[![Check Links](https://github.com/2qx/awesome-bitcoin-cash/actions/workflows/links.yml/badge.svg)](https://github.com/2qx/awesome-bitcoin-cash/actions/workflows/links.yml) + + +# Contents + +- [Contents](#contents) +- [Getting Started](#getting-started) +- [State of the Project](#state-of-the-project) +- [Whitepaper](#whitepaper) +- [Open-Source Wallets](#open-source-wallets) + - [Mobile](#mobile) + - [Desktop](#desktop) + - [Electron-Cash Plugins](#electron-cash-plugins) + - [Cli](#cli) + - [Browser](#browser) + - [Paper/Offline Generator](#paperoffline-generator) +- [Podcasts, News, Media](#podcasts-news-media) +- [Projects Built on Bitcoin Cash](#projects-built-on-bitcoin-cash) + - [Apps (Social)](#apps-social) + - [Crowdfunding](#crowdfunding) + - [BCH Native Decentralized Finance](#bch-native-decentralized-finance) + - [Collectables](#collectables) + - [Entertainment](#entertainment) + - [Exchanges](#exchanges) + - [Centralized](#centralized) + - [More decentralized](#more-decentralized) + - [Oracles](#oracles) + - [Faucets](#faucets) + - [Network](#network) + - [Explorers](#explorers) + - [Testnet Explorers](#testnet-explorers) + - [Services](#services) + - [Utilities](#utilities) + - [Web](#web) + - [See Also](#see-also) +- [Merchants and Services Accepting Bitcoin Cash](#merchants-and-services-accepting-bitcoin-cash) + - [A Short List](#a-short-list) + - [Geographic lists](#geographic-lists) + - [Projects dedicated to listing or enabling eCommerce.](#projects-dedicated-to-listing-or-enabling-ecommerce) + - [Some Charities and Foundations](#some-charities-and-foundations) +- [eCommerce Merchant Resources](#ecommerce-merchant-resources) + - [Bitcoin Cash Open-Source plugins](#bitcoin-cash-open-source-plugins) + - [Point of Sale Clients](#point-of-sale-clients) + - [Non-Custodial Payment Processors](#non-custodial-payment-processors) + - [BCH-to-Fiat Payment Processors](#bch-to-fiat-payment-processors) + - [Payment Processor 
Status](#payment-processor-status) +- [Documentation](#documentation) + - [General](#general) + - [Base Protocol](#base-protocol) + - [Secondary protocols](#secondary-protocols) + - [Discussion](#discussion) + - [CHIP Process](#chip-process) + - [Previous consensus changes, May 2023:](#previous-consensus-changes-may-2023) + - [Bitcoin Script](#bitcoin-script) +- [Software](#software) + - [Full Nodes](#full-nodes) + - [Developer Resources](#developer-resources) + - [Open-Source Teams Building on Bitcoin Cash](#open-source-teams-building-on-bitcoin-cash) + - [Simple Payment Verification (SPV)](#simple-payment-verification-spv) + - [Libraries \& SDKs](#libraries--sdks) + - [Language Agnostic](#language-agnostic) + - [Typescript](#typescript) + - [Javascript](#javascript) + - [Python](#python) + - [Rust](#rust) + - [Java](#java) + - [C](#c) + - [PHP](#php) + - [R](#r) +- [Endorsements](#endorsements) + - [The Adaptive Blocksize Limit Algorithm (ebaa) CHIP for the May 2024 BCH Upgrade is AWESOME!](#the-adaptive-blocksize-limit-algorithm-ebaa-chip-for-the-may-2024-bch-upgrade-is-awesome) + - [The CashTokens and P2SH32 CHIP Proposals for the May 2023 BCH Upgrade are AWESOME!](#the-cashtokens-and-p2sh32-chip-proposals-for-the-may-2023-bch-upgrade-are-awesome) +- [The Archive](#the-archive) + - [Bitcoin Script tools](#bitcoin-script-tools) + - [Simple Ledger Protocol (SLP Token)](#simple-ledger-protocol-slp-token) + - [Protocols](#protocols) + - [Libraries](#libraries) + - [SLP Token Projects](#slp-token-projects) + +# Getting Started + +- [bitcoincash.org](https://bitcoincash.org) - A general multi-lingual introduction. +- [BCH Info](https://bch.info/) - Multilingual site for general information about bitcoin cash. +- [BCHFAQ.com](https://bchfaq.com/) [[code]](https://github.com/fixthetracking/Bitcoin-Cash-FAQ) - Learn the fundamentals of Bitcoin Cash by getting simple answers to your basic questions. 
+- [Why Bitcoin Cash?](https://whybitcoincash.com/) [[archive]](https://web.archive.org/web/20230228125654/https://whybitcoincash.com/) - The revolution will not be censored. +- [Bitcoin.com Getting Started](https://www.bitcoin.com/get-started/) - Comprehensive introduction for general audiences. +- [Why Cryptocurrencies?](https://whycryptocurrencies.com/toc.html) [[code]](https://github.com/treeman/why_cryptocurrencies) - An explanation on why cryptocurrencies were created, what they do differently and why they matter. + +# State of the Project + +- [Three Years In: A Bitcoin Cash Update From One of Its Founders](https://news.bitcoin.com/three-years-in-a-bitcoin-cash-update-from-one-of-its-founders/) - by Jonald Fyookball + +# Whitepaper + +"Bitcoin: A Peer-to-Peer Electronic Cash System" by Satoshi Nakamoto. + +Bitcoin Cash is one chain of Satoshi Nakamoto's blockchain invention which was deliberately hard-forked on August 1st, 2017. It shares the whitepaper, first block, and all bitcoin block history prior to the fork. It attempts to implement the central idea outlined in that paper. + +Below is a copy of the original nine page whitepaper: + +- [Archived copy](https://web.archive.org/web/20100704213649if_/http://www.bitcoin.org:80/bitcoin.pdf) of the bitcoin whitepaper from bitcoin.org. +- [bitcoin whitepaper](https://gateway.ipfs.io/ipfs/QmRA3NWM82ZGynMbYzAgYTSXCVM14Wx1RZ8fKP42G6gjgj) via ipfs. +- Websites hosting the bitcoin whitepaper [[wayback archive]](http://web.archive.org/web/20210516141704if_/https://blockchair.com/bitcoin/whitepaper), with sha256 hashes calculated as of May 16th 2021. +- [As a webcomic](https://web.archive.org/web/20230215013643/https://whitepaper.coinspice.io/) [[中文]](https://web.archive.org/web/20230315051200/https://whitepaper.coinspice.io/cn) [[日本語]](https://web.archive.org/web/20200217125719/https://www.bitcoin.jp/what-is-bitcoin/bitcoin-whitepaper-comic/) - Bitcoin Whitepaper web comic by Scott McCloud. 
+- [Instructions and code](https://bitcoin.stackexchange.com/questions/35959/how-is-the-whitepaper-decoded-from-the-blockchain-tx-with-1000x-m-of-n-multisi) for building the original paper encoded on the blockchain on 2013-04-06. + +# Open-Source Wallets + +Below are non-custodial open-source wallets that use features specific to Bitcoin Cash. + +**[Best BCH Wallets](https://www.bestbchwallets.com)** is a tool for selecting a wallet based on operating system and features. + +## Mobile + +- 🔵 [Electron-Cash](https://electroncash.org) - Android [[code]](https://github.com/Electron-Cash/Electron-Cash/tree/master/android) and iOS [[code]](https://github.com/Electron-Cash/Electron-Cash/tree/master/ios) versions available with more limited functionality. +- 🔵 [Paytaca](https://www.paytaca.com/) [[apk]](https://github.com/paytaca/paytaca-app/releases) [[code]](https://github.com/paytaca/paytaca-app) - A mobile wallet for Android, iOS and ChromeOS +- [Flowee Pay](https://flowee.org/products/pay/) [[code]](https://codeberg.org/Flowee/pay/) [[apk]](https://flowee.org/products/pay/) [[docs]](https://codeberg.org/Flowee/Pay/wiki) - A user friendly wallet for Android and Linux desktop. +- [Selene Wallet](https://selene.cash/) [[code]](https://git.xulu.tech/selene.cash/selene-wallet/) - Easy, no-hassle, instant payments in the palm of your hand. +- [Stack Wallet](https://stackwallet.com/) [[code]](https://github.com/cypherstack/stack_wallet) - Multicoin wallet with UTXO (coin) control. +- [Cake Wallet](https://cakewallet.com/) [[code]](https://github.com/cake-tech/cake_wallet) [[apk]](https://github.com/cake-tech/cake_wallet/releases) - An open source wallet for iOS and Android supporting XMR and other currencies. +- 🔵 [zapit](https://zapit.io/#/)* - A native, non-custodial Bitcoin Cash wallet for iOS and Android. 
*Not open source + +## Desktop +- 🔵 [Electron Cash CashToken](https://electroncash.org) [[release]](https://github.com/Electron-Cash/Electron-Cash/releases/tag/4.3.0) [[code]](https://github.com/Electron-Cash/Electron-Cash/) - Electron Cash with CashTokens. +- [Flowee Pay](https://flowee.org/products/pay/) [[code]](https://codeberg.org/flowee/pay) - A payment solution, a wallet, a basis for your new product. But currently just a desktop wallet. +- 🔵 [Cashonize (quasar)](https://github.com/cashonize/cashonize-quasar/releases/tag/v0.0.2) [[code]](https://github.com/cashonize/cashonize-quasar) - Cashonize rewrite with Quasar & Vue-js + +### Electron-Cash Plugins + +- [Flipstarter Plugin](https://gitlab.com/flipstarter/flipstarter-electron-cash) - plugin for crowdfunding. +- [Nostron](https://github.com/Electron-Cash/Nostron/) - Nostron is a plugin for the Electron-Cash BCH wallet. +- [Inter-Wallet Transfer plugin](https://github.com/KarolTrzeszczkowski/Inter-Wallet-Transfer-EC-plugin) - A plugin, that sends your coins to another wallet one by one, every time to a fresh address. +- [Mecenas Plugin](https://github.com/KarolTrzeszczkowski/Mecenas-recurring-payment-EC-plugin/releases) - recurring payments. +- [Last Will](https://github.com/KarolTrzeszczkowski/Electron-Cash-Last-Will-Plugin) - dead man smart contract creation. +- [HODL](https://github.com/mainnet-pat/hodl_ec_plugin/) - smart contract plugin for Electron Cash to timelock funds. +- [AutoCove](https://github.com/TinosNitso/AutoCove-Plugin) - Electrum-cash script decoder. + +## Cli + +- [bitcore-wallet](https://github.com/bitpay/bitcore/tree/master/packages/bitcore-wallet) - A command line wallet used for BitPay wallets. + +## Browser +- 🔵 [Cashonize](https://cashonize.com/) [[code]](https://github.com/cashonize/wallet) - An experimental web wallet for CashTokens. 
+- [PSF wallet](https://wallet.fullstack.cash/) [[code]](https://github.com/Permissionless-Software-Foundation/gatsby-ipfs-web-wallet) - A web wallet with SLP support. +- 🔵 [Microfi Wallet](https://microfi.eu/wallet/) - Microfi Free Flow Wallet +- [BCH Merchant PoS](https://pos.cash) [[code]](https://github.com/softwareverde/pos-cash) - Bitcoin Cash Web Point of Sale, from SoftwareVerde. + +## Paper/Offline Generator + +- [Cash Address Generator](https://cashaddress.org/) [[code]](https://github.com/theantnest/bccaddress) - reputable javascript address generator suitable for offline use. +- [Bitcoin.com Paper Wallet](https://paperwallet.bitcoin.com/) [[code]](https://github.com/Bitcoin-com/paperwallet.bitcoin.com) - A fork of the cashaddress.org paper wallet +- Keep Bitcoin Free Paper Wallet [[code]](https://github.com/KeepBitcoinFree-org/paper.keepbitcoinfree.org) - A fork of the Bitcoin.com paper wallet +- [BCH Gifts](https://gifts.bitcoin.com/) - generate reclaimable preloaded paper private keys as gifts. + +# Podcasts, News, Media + +Bitcoin Cash focussed media and content. + +- [The Bitcoin Cash Podcast](https://www.bitcoincashpodcast.com) - Available on [Youtube](https://www.youtube.com/channel/UCsrDsJnHFnkMnJhEslofyPQ) and [RSS](https://rss.com/podcasts/bitcoincashpodcast/) audio versions, plus other video and podcast platforms (see links at bottom of website). +- [Bitcoin Cash Foundation](https://bitcoincashfoundation.org/) Weekly News - Available on [Youtube](https://www.youtube.com/@BitcoinCashFoundation) and [Telegram](https://t.me/BCHFNews) +- General Protocol Spaces - Available on [Youtube](https://www.youtube.com/watch?v=707-DPzhdA8&list=PLcIK2erO9hWyM56FYkUAilfUmABbwpu7U) and twitter. + + +# Projects Built on Bitcoin Cash + +All of these apps are mostly stable and active. Always check the notes of a particular project before risking a large sum of value. Links are checked on a weekly basis, but function is not checked. 
+ +## Apps (Social) + +- [read.cash](https://read.cash) - a conventionally hosted long-format blogging platform, with BCH tipping for content. +- [memo.cash](https://memo.cash) - short message social media site with decentralized SLP token exchange. +- [Cashrain](https://cashrain.com/) - A platform where creators create communities for their members. +- [noise.app](https://noise.app) - An invite only Bitcoin Cash powered micro-blogging platform. +- [OnlyCoins](https://onlycoins.com/) - Adult content monetization platform. +- [Glimpse.cash](https://glimpse.cash/) - A pay per view video hosting and streaming platform. +- [Gaze.cash](https://gaze.cash/) - A more lenient pay-per-view video platform. +- [WhoTipped.it](https://whotipped.it/) - Last tips given on memo.cash + +## Crowdfunding + +- [flipstarter](https://flipstarter.cash/) [[Introduction]](https://read.cash/@flipstarter/introducing-flipstarter-695d4d50) [[code]](https://gitlab.com/flipstarter/backend) - a crowd funding app using anyone can pay multisig transactions. +- IPFS Flipstarter [[code]](https://gitlab.com/ipfs-flipstarter) - An IPFS flipstarter campaign site. + +## BCH Native Decentralized Finance + +[DefiLlama](https://defillama.com/chain/Bitcoincash) - Statistics for Bitcoin Cash Defi. + +- [BCH Bull](https://bchbull.com/) [[app]](https://app.bchbull.com/) - Permissionless leverage and hedging using the Anyhedge protocol. +- 🔵 [TapSwap](https://tapswap.cash/) - An open marketplace for fungible and non-fungible tokens. 
+- 🔵 [Cauldron](https://www.cauldron.quest/) [[whitepaper]](https://www.cauldron.quest/_files/ugd/ae85be_b1dc04d2b6b94ab5a200e3d8cd197aa3.pdf) - A Constant product market maker contract +- [Unspent](https://unspent.cash) [[code]](https://github.com/2qx/unspent) [[cli]](https://www.npmjs.com/package/unspent) [[docs]](https://unspent.app/documentation) - An irrevocable perpetuity app +- 🔵 [Emerald DAO](https://emerald-dao.cash/) [[app]](https://emerald-dao.vercel.app/) [[code]](https://gitlab.com/0353F40E/emerald-dao/) - A simple Bitcoin Cash DAO template which acts as a fixed-term deposit savings vault. +- 🔵 [Wrapped Cash](https://wrapped.cash/) [[code]](https://gitlab.com/dagurval/wrapped-cash) - Bitcoin Cash wrapped as a CashToken + + + +## Collectables + +- 🔵 [BCH Guru NFTs](https://nfts.bch.guru) - a premier collection of NFTs +- 🔵 [Ghostwriter](https://ghostwriter.pages.dev/) - Text based NFT minting +- 🔵 [Bitcats Heroes](https://bitcatsheroes.club/) - Collectible NFT series with non-custodial minting contract. +- 🔵 [CashNinjas](https://ninjas.cash/) [[code]](https://github.com/cashninjas) - an NFT collection leveraging the new CashTokens technology. + + +## Entertainment + +- [bch.games](https://bch.games/) - dice and numbers game. +- 🔵 [BCH Guru](https://bch.guru) - A peer to peer price prediction game on Bitcoin Cash +- 🔵 [DogeCash](https://dogecash.uwu.ai/) - Don't let your dreams be memes +- [craft.cash](https://craft.cash/) [[code]](https://github.com/blockparty-sh/craft.cash) - Voxel world stored on Bitcoin Cash. +- [Satoshi dice](https://www.satoshidice.com/) - a provably fair dice game. +- [Spin BCH](https://SpinBCH.com) - Spinning wheel based gambling using zero-conf + +## Exchanges + +Bitcoin Cash is supported on hundreds of exchanges, these are a few. 
+ +### Centralized + +- [CoinEx](https://www.coinex.com/) - A BCH friendly exchange with automatic coin-splitting + +### More decentralized + +- [Thorchain Swap](https://app.thorswap.finance/) - Swap native assets directly with any non-custodial wallet across nine blockchains. +- [Komodo Wallet](https://app.komodoplatform.com/) - Decentralized exchange with desktop clients supporting BCH and many UTXO coins, ETH, ERC-20 tokens + +## Oracles + +- [Oracles.Cash](https://oracles.cash/) [[Best Practices]](https://gitlab.com/GeneralProtocols/priceoracle/library#best-practices-for-price-oracle-consumers) [[spec]](https://gitlab.com/GeneralProtocols/priceoracle/specification) - Price oracles for Bitcoin Cash + +## Faucets + +- 🔵 [Testnet Faucet](https://tbch.googol.cash/) [[code]](https://gitlab.com/uak/light-crypto-faucet) +- 🔵 [`unspent`](https://www.npmjs.com/package/unspent?activeTab=readme) [[code]](https://github.com/2qx/unspent) - a javascript package with commands for faucets. +- BCH Testnet Faucet [[code]](https://github.com/christroutner/testnet-faucet2/) - Fullstack.cash faucet for tBCH. + +## Network + +- [fork.lol](https://fork.lol) - Site to monitor network health in relation to BTC. +- [Johoe's Bitcoin Mempool Statistics](https://jochen-hoenicke.de/queue/) [[code]](https://github.com/jhoenicke/mempool) - Colorful mempool graphs. +- [Electrum Server Status for BCH](https://1209k.com/bitcoin-eye/ele.php?chain=bch) [[or tBCH]](https://1209k.com/bitcoin-eye/ele.php?chain=tbch) - A 1209k hosted list of electrum servers +- [Tx Street](https://txcity.io/v/bch-eth) [[code]](https://github.com/txstreet/txstreet) - a live blockchain transaction and mempool visualizer. +- [Bitcoin Energy Statistics](https://www.monsterbitar.se/~jonathan/energy/) - A comparison of energy usage for BCH and BTC. 
+ +### Explorers +- 🔵 [Blockchain Explorer](https://explorer.bch.ninja/) [[code]](https://github.com/sickpig/bch-rpc-explorer) [[mirror: BU]](https://explorer.bitcoinunlimited.info/) [[mirror: electroncash.de]](https://explorer.electroncash.de) - Database-free, self-hosted Bitcoin Cash explorer, via RPC. +- 🔵 [Bitcoin Cash Explorer](https://explorer.salemkode.com/) [[code]](https://github.com/salemkode/explorer) - A Bitcoin Cash Explorer with CashTokens, by SalemKode. +- 🔵 [3xpl.com BCH Explorer](https://3xpl.com/bitcoin-cash) [[code]](https://github.com/3xplcom)- Fastest ad-free universal block explorer. +- [BCH Explorer](https://explorer.melroy.org/) [[code]](https://gitlab.melroy.org/bitcoincash/explorer) - Bitcoin Cash Explorer by Melroy van den Berg +- [Blockchair BCH Explorer](https://blockchair.com/bitcoin-cash) - Universal blockchain explorer and search engine. +- [Blockchain.com BCH explorer](https://www.blockchain.com/explorer?view=bch) - Established blockchain explorer. +- 🔵 [BCH CashTokens NFT Viewer](https://viewer.sploit.cash) [[code]](https://github.com/acidsploit/cashtokens-nft-viewer) - Sploit's NFT viewer. + ### Testnet Explorers + - 🔵 [Chipnet (im_uname)](https://chipnet.imaginary.cash) + - 🔵 [Chipnet (chaingraph)](https://chipnet.chaingraph.cash) + - 🔵 [Chipnet (bch.ninja)](https://chipnet.bch.ninja) + - [Testnet [old]](https://texplorer.bitcoinunlimited.info/), [[mirror]](https://testnet-explorer.electroncash.de/) +- [Chaingraph](https://chaingraph.cash/) [[code]](https://github.com/bitauth/chaingraph) - A multi-node blockchain indexer and GraphQL API. +- [CoinGecko API](https://www.coingecko.com/api/documentation) - Free tier api for price data. +- [Blockchair Bulk Data](https://gz.blockchair.com/bitcoin-cash/) - Daily compressed dumps of blockchain data. +- [CashFusion Stats](https://fusionstats.redteam.cash/) - Data on privacy-enhancing CashFusion transactions. 
+- [Mempool Project](https://bchmempool.cash/) - A Bitcoin Cash (BCH) adaptation of the mempool open-source explorer. +- [bitcoinfees.cash](https://bitcoinfees.cash/) - bitcoin chain fee juxtaposition. + +## Services + +- 🔵 [OpenTokenRegistry](https://otr.cash/) [[code]](https://github.com/OpenTokenRegistry/otr.cash) - Community-Verified Token Information +- 🔵 [IPFS-BCH](https://ipfs-bch.pat.mn/) [[code]](https://github.com/mainnet-pat/ipfs-bch.pat.mn) - IPFS file pinning service with on-chain settlement +- [CashTags](https://tags.infra.cash/) [[code]](https://github.com/developers-cash/cashtags-server) - Service for printable QR Codes (Payment URLs) whose value amounts can be specified in fiat (e.g. USD). +- [SideShift.ai](https://sideshift.ai/) - enables HUMANS and AI to shift between 30+ cryptocurrencies. +- 🔵 [Token Stork](https://tokenstork.com/) - A CashToken market capitalization explorer. +- 🔵 [Token Explorer](https://tokenexplorer.cash/) - A Token explorer for CashTokens. +- [Chaintip Bounties](https://github.com/chaintip/bounties/blob/master/README.md#available-bounties) - BCH bot for github bounties. +- [BCH.gg](https://bch.gg/) - Bitcoin Cash URL Shortener + +## Utilities + +- [CashAccount](https://www.cashaccount.info/) - Online utility for cashaccounts (address handles). +- 🔵 [Bitauth IDE](https://ide.bitauth.com/) [[code]](https://github.com/bitauth/bitauth-ide) [[walk-thru]](https://www.youtube.com/watch?v=o-igo-adS8E) - An online IDE for developing Bitcoin Cash contracts. +- 🔵 [CashTokens Studio](https://cashtokens.studio/) - CashToken and Authkey creation tool ([chipnet](https://chipnet.cashtokens.studio/)) +- [Bitcoin.com Tools](https://tools.bitcoin.com/) - A mix of Bitcoin utilities. +- 🔵 [CashTokens Airdrop Tool](https://github.com/mr-zwets/airdrop-tool) - A command line utility to airdrop fungible tokens to NFT holders. 
+ +## Web + +- [Bitcoin Paywall](https://wordpress.org/plugins/bitcoin-paywall/) [[code]](https://plugins.trac.wordpress.org/browser/bitcoin-paywall/) - Wordpress paywall plugin + +## See Also + +These are other projects dedicated to listing projects in the Bitcoin Cash ecosystem: + +- [HelpMe Cash](https://helpme.cash/) - A collection of links to things related to the cryptocurrency Bitcoin Cash +- [Bitcoin Cash Projects](https://www.bitcoin.com/bitcoin-cash-projects/) - maintained by bitcoin.com. +- [BCH Developments](https://keepbitcoinfree.org/bch-dev/) - list maintained by KeepBitcoinFree. +- [Canonical awesome-bitcoin-cash](https://github.com/dsmurrell/awesome-bitcoin-cash) - the original. +- [Mainnet Cash List](https://mainnet.cash/projects.html) - A list of projects maintained at mainnet.cash +- [BCHGANG Link Directory](https://bchgang.org) - A directory of links about the cryptocurrency Bitcoin Cash: wallets, merchants, exchanges, tools, references, block explorer, developer guides, tutorials and more. + +# Merchants and Services Accepting Bitcoin Cash + +## A Short List + +These vendors have accepted bitcoin for years and are committed (or sympathetic) toward the idea of electronic cash payments. + +Although some of these may appear to only accept Bitcoin (BTC), they do, in fact, accept Bitcoin Cash also. + +- [Namecheap](https://namecheap.com) - dns, ssl and some packaged hosting. +- [keys4coins](https://www.keys4coins.com/) - Buy PC games and gift cards with cryptocurrency. +- [alfa.top](https://alfa.top/) - Buy mobile top-up (credit) and internet with cryptocurrency. +- [CheapAir](https://www.cheapair.com) - for your travel needs. +- [Travala](https://www.travala.com) - for your travel needs. +- [items sold by Newegg](https://kb.newegg.com/knowledge-base/using-bitcoin-on-newegg/) - good for a great headset. 
+ +## Geographic lists + +- [OpenStreetMap BCH Tag](https://overpass-turbo.eu/?w=%22currency%3ABCH%22%3D%22yes%22+global&R) - Entries tagged with `currency:BCH=yes` in OSM. +- [Bitcoin.com map](https://map.bitcoin.com/) - website and mobile app for discovering merchants, formerly marco coino. +- [Bmap.app](https://bmap.app/) - ₿itcoin places all around the world! +- [where2cash](https://where2.cash/) - Bitcoin Cash Map using OpenStreetMap data. +- [map.usecash](https://map.usecash.com) [[code]](https://github.com/modenero/use-cash) - Use Cash map built by Modenero. + +## Projects dedicated to listing or enabling eCommerce. + +- [Use.Cash](https://usecash.com/) - Guide for using cryptocurrency like cash. +- [Bitgree](https://www.bitgree.com) - service to privately purchase goods on Amazon.com and others at a discount. + +## Some Charities and Foundations + +Just some good charities for the world at large. + +- [Tails](https://tails.boum.org/donate/index.en.html) - The Amnesic Incognito Live System, is a security-focused Debian-based Linux distribution aimed at preserving privacy and anonymity. +- [Save the Children](https://files.savethechildren.org/cryptocurrency-donation/) - **A United Kingdom based charity, founded in 1919**, to improve the lives of children through better education, health care, and economic opportunities, as well as providing emergency aid in natural disasters, war, and other conflicts. (Cryptocurrency donations are powered by [The Giving Block](https://www.thegivingblock.com/)) +- [The Internet Archive](https://blockchair.com/bitcoin-cash/address/1Archive1n2C579dMsAu3iC6tWzuQJz8dN) - 1Archive1n2C579dMsAu3iC6tWzuQJz8dN +- [Bitpay Charity Directory](https://bitpay.com/directory/nonprofits) A list of charities that accept Bitcoin Cash and other cryptocurrencies. 
+ +# eCommerce Merchant Resources + +## Bitcoin Cash Open-Source plugins + +- [CryptoWoo for WooCommerce](https://github.com/WeProgramIT/cryptowoo-bitcoin-cash-addon) - Bitcoin Cash integration for CryptoWoo + +## Point of Sale Clients + +- 🔵 [Paytaca](https://www.paytaca.com/) [[apk]](https://github.com/paytaca/paytaca-app/releases) [[code]](https://github.com/paytaca/paytaca-app) - A mobile wallet with integrated POS. +- [pos.cash](https://pos.cash) [[code]](https://github.com/softwareverde/pos-cash) - a non-custodial web-based point of sale BCH client. + +## Non-Custodial Payment Processors + +- [Prompt.cash](https://prompt.cash) [[demo]](https://www.youtube.com/watch?v=8TIpZW1P_9M) [[docs]](https://prompt.cash/pub/docs/#introduction) - a non-custodial Bitcoin Cash payment gateway +- [Cash Pay Server](https://github.com/developers-cash/cash-pay-server-js) [[docs]](https://developers-cash.github.io/cash-pay-server-js/) - a self-hostable NodeJS micro-service that can be used to handle BIP70 and JSON Payment Protocol invoices for Bitcoin Cash (BCH) + +## BCH-to-Fiat Payment Processors + +- [BitPay developer Integrations](https://bitpay.com/integrations/) [[api docs]](https://bitpay.com/docs) + +## Payment Processor Status + +- [status.bitpay.com](https://status.bitpay.com/) - Current status with recent incidents. + +# Documentation + +## General + +- [developers.cash](https://developers.cash/) - many useful resources +- [Permissionless Software Foundation Videos](https://psfoundation.cash/video/) +- [Electron Cash Wiki](https://wiki.electroncash.de/wiki/Main_Page) + +## Base Protocol + +- [BCH Specification](https://flowee.org/docs/spec/) - Specification hosted by flowee.org. +- [Bitcoin Cash Protocol Documentation](https://documentation.cash/) [[code]](https://github.com/SoftwareVerde/bitcoin-cash-specification) - maintained by Software Verde. 
+- [reference.cash](https://reference.cash) - protocol documentation +- [Upgrade specs](https://upgradespecs.bitcoincashnode.org/) - Bitcoin Cash upgrade specifications as implemented by BCHN. + +### Secondary protocols + +[Bitcoin Cash Standards](https://bitcoincashstandards.org) is a site dedicated to collecting standards, some of which are listed below: + +- [AnyHedge](https://anyhedge.com/) [[docs]](https://anyhedge.com/developers/) [[code]](https://gitlab.com/GeneralProtocols/anyhedge) - Decentralized hedge solution against arbitrary commodities for Bitcoin Cash +- 🔵 [Bitcoin Cash Metadata Registries (BCMR)](https://cashtokens.org/docs/bcmr/chip/) [[code]](https://github.com/bitjson/chip-bcmr) - A standard for sharing authenticated metadata between Bitcoin Cash wallets. +- [Cashaddr](https://upgradespecs.bitcoincashnode.org/cashaddr/) - Format for Bitcoin Cash addresses. +- [Cash Accounts](https://gitlab.com/cash-accounts/specification/blob/master/SPECIFICATION.md) - attach a human readable name to Bitcoin Cash addresses. +- [CashFusion](https://cashfusion.org) [[spec]](https://github.com/cashshuffle/spec/blob/master/CASHFUSION.md) - a privacy protocol for privately and trustlessly joining coin amounts. +- [CashID](https://gitlab.com/cashid/protocol-specification) - Specification using Bitcoin Cash for secure authentication. +- 🔵 [CashTokens](https://cashtokens.org/) [[code]](https://github.com/cashtokens/cashtokens.org) - Specification for CashTokens. +- [Electrum Cash Protocol (Fulcrum)](https://electrum-cash-protocol.readthedocs.io/en/latest/) [[code]](https://github.com/cculianu/electrum-cash-protocol) - ElectrumX Protocol for [fulcrum](https://fulcrumserver.org) (UTXO indexer/SPV service). +- [Electrum Cash Protocol](https://bitcoincash.network/electrum/) [[code]](https://github.com/dagurval/electrum-cash-protocol) - Protocol for SPV clients and servers. 
+- [Payment Requests Specification (BIP-0070)](https://github.com/bitcoin/bips/blob/master/bip-0070.mediawiki) - For dealing with invoice style payments at specific amounts. +- [Price Oracle](https://gitlab.com/GeneralProtocols/priceoracle/specification) [[implementation]](https://gitlab.com/GeneralProtocols/priceoracle/library) - Price oracle. +- [Memo Protocol](https://memo.cash/protocol) - for the on-chain tweet style social media app. +- [CashShuffle](https://cashshuffle.com/) [[spec]](https://github.com/cashshuffle/spec/blob/master/SPECIFICATION.md) - a privacy protocol for combining transactions with others, splitting to the lowest common amount. + +## Discussion + +An archive of past and future ideas for Bitcoin Cash ongoing at Bitcoin Cash Research (BCR). Collaborating participants have recorded their thoughts and concerns about various potential ideas & implemented improvements. + +- [Bitcoin Cash Research](https://bitcoincashresearch.org/) - Site dedicated to technical discussion. + +## CHIP Process + +Protocol changes, software standards and application specifications may be proposed by anyone. The recommended process for consensus building and conflict reduction is known as the Cash Improvement Proposal (CHIP) Process. + +- [CHIP Guidelines](https://gitlab.com/ggriffith/cash-improvement-proposals/-/blob/master/CHIP-2020-11-CHIP-Guidelines.md) +- [CHIPs: A more detailed process recommendation](https://gitlab.com/im_uname/cash-improvement-proposals/-/blob/master/CHIPs.md) +- [CHIPs](https://bitcoincashresearch.org/c/chips/) - a dynamic list of proposed standards +- [List of CHIPs](https://bch.info/chips) - documents that record proposals to upgrade the Bitcoin Cash protocol, and their ongoing progress, both technical and consensus-building. 
+ +### Previous consensus changes, May 2023: + +- [CHIP-2021-01 Restrict Transaction Version (v1.0)](https://gitlab.com/bitcoin.cash/chips/-/blob/master/CHIP-2021-01-Restrict%20Transaction%20Versions.md) +- [CHIP-2021-01 Minimum Transaction Size (v0.4)](https://gitlab.com/bitcoin.cash/chips/-/blob/master/CHIP-2021-01-Allow%20Smaller%20Transactions.md) +- [CHIP-2022-02 CashTokens (v2.2.1)](https://github.com/bitjson/cashtokens/) +- [CHIP-2022-05 P2SH32 (v1.5.1)](https://gitlab.com/0353F40E/p2sh32/-/blob/main/CHIP-2022-05_Pay-to-Script-Hash-32_(P2SH32)_for_Bitcoin_Cash.md) + +Anyone may propose an improvement to Bitcoin Cash, but the responsibility is on the CHIP owner to see the idea through to fruition and build consensus. + +## Bitcoin Script + +- 🔵 [Cashscript](https://cashscript.org/docs/basics/about/) [[code]](https://github.com/Bitcoin-com/cashscript) [[playground]](https://playground.cashscript.org/) - a solidity-style language that compiles to Bitcoin Cash Script. +- 🔵 [bitauth ide](https://ide.bitauth.com/) [[code]](https://github.com/bitauth/bitauth-ide) [[video intro]](https://www.youtube.com/watch?v=o-igo-adS8E) - an integrated development environment for bitcoin authentication. +- [AutoCove](https://github.com/TinosNitso/AutoCove-Plugin) - Electrum-cash script decoder. +- [Cashscript VSCode plugin](https://marketplace.visualstudio.com/items?itemName=nathanielcherian.cashscript) [[code]](https://github.com/nathanielCherian/vscode-cashscript) - Visual Studio Code extension for cashscript. + +# Software + +## Full Nodes + +- 🔵 [BCHN](https://bitcoincashnode.org/) [[code]](https://gitlab.com/bitcoin-cash-node/bitcoin-cash-node) [[docs]](https://docs.bitcoincashnode.org/) - a descendant of the Bitcoin Core and Bitcoin ABC software projects with independent development team. C/C++. 
+- 🔵 [BitcoinUnlimited](https://www.bitcoinunlimited.info/) [[code]](https://github.com/BitcoinUnlimited/BitcoinUnlimited) - a full node implementation focused on supporting user needs, C/C++. + - [Bitcoin Unlimited Improvement Proposals (BUIPS)](https://www.bitcoinunlimited.info/voting/) +- 🔵 [Flowee the Hub](https://flowee.org/) [[code]](https://codeberg.org/Flowee/thehub) - a node supporting a suite of software focused on payment integration. C++ +- 🔵 [Bitcoin Verde](https://bitcoinverde.org/) [[code]](https://github.com/softwareverde/bitcoin-verde) [[docs]](https://explorer.bitcoinverde.org/documentation/) - java implementation with the goal of being interoperable with mining nodes. +- 🔵 [Knuth](https://kth.cash/) [[code]](https://github.com/k-nuth/kth) - a high performance implementation of the Bitcoin protocol focused on applications needing extra capacity and resilience. +- [bchd](https://bchd.cash/) [[code]](https://github.com/gcash/bchd) [[docs]](https://github.com/gcash/bchd/tree/master/docs) - [DEPRECATED] alternative implementation written in Go (golang) + +### Developer Resources + +- [Bitcoin Cash Research](https://bitcoincashresearch.org/) - Site dedicated to technical research on Bitcoin Cash. + +## Open-Source Teams Building on Bitcoin Cash + +> If you want to go fast, go alone. If you want to go far, go together. +> +> -- An African Proverb. + +There are various groups developing software stacks & apps for the broader ecosystem. + +- [General Protocols](https://GeneralProtocols.com) [[repos]](https://gitlab.com/GeneralProtocols) - Team researching and developing protocols for non-custodial and trustless networks using BitBox.
(Typescript and Javascript) +- [Electron Cash](https://electroncash.org/) [[repos]](https://github.com/Electron-Cash/) - Team maintaining a desktop SPV wallet with plugins and mobile app (Python) +- [Flowee](https://flowee.org) [[repos]](https://codeberg.org/Flowee) - Team maintaining a non-mining full node and services to access the Bitcoin Cash network. (C++, NodeJs et al) +- [FullStack Cash](https://fullstack.cash/) [[repos]](https://github.com/Permissionless-Software-Foundation) - Team building web/ipfs apps based on BitBox compatible stack. (Javascript) +- [Mainnet Cash](https://mainnet.cash/) [[repos]](https://github.com/mainnet-cash/) - Loose-knit team maintaining a shared server-side and client-side library. + +## Simple Payment Verification (SPV) + +- 🔵 [Fulcrum](https://fulcrumserver.org) [[repos]](https://github.com/cculianu/Fulcrum/) - A fast & nimble SPV Server for Bitcoin Cash. +- 🔵 [Rostrum](https://gitlab.com/bitcoinunlimited/rostrum) - Rostrum is an efficient implementation of Electrum Server written in Rust. + +## Libraries & SDKs + +- [Developer tools](https://bch.info/en/developers) - Page devoted to high level developer tools. +- [Mainnet Cash List](https://mainnet.cash/for-developers.html) - A list of useful services for developers. + +### Language Agnostic + +- 🔵 [mainnet](https://mainnet.cash/) [[tutorial]](https://mainnet.cash/tutorial/) [[rest spec]](https://rest-unstable.mainnet.cash/api-docs/#/) - Typescript library, also available via rest api, or [python](https://github.com/mainnet-cash/mainnet-python-generated), [golang](https://github.com/mainnet-cash/mainnet-go-generated), [php](https://github.com/mainnet-cash/mainnet-php-generated) clients, [et. 
al](https://mainnet.cash/tutorial/other-languages.html) +- [Insomnia](https://insomnia.fountainhead.cash/) [[code]](https://github.com/fountainhead-cash/insomnia) - Swagger/OpenAPI3 specification for ElectrumX +- [BitBox OpenAPI 3 (Swagger) spec](https://github.com/Bitcoin-com/rest.bitcoin.com/tree/master/swaggerJSONFiles) - for rest.bitcoin.com see: [openapi-generator](https://github.com/OpenAPITools/openapi-generator) + +### Typescript + +- 🔵 [Libauth](https://libauth.org/) [[code]](https://github.com/bitauth/libauth) - an ultra-lightweight, zero-dependency library for Bitcoin Cash and Bitauth applications. (Formerly `bitcoin-ts`.) +- 🔵 [electrum-cash](https://gitlab.com/electrum-cash) [[docs]](https://electrum-cash.gitlab.io/network/) [[tutorials]](https://read.cash/search?q=electrum-cash) - JavaScript library that lets you connect with one or more Electrum servers. +- [flowee-js](https://flowee.org/floweejs/) [[docs]](https://flowee.org/docs/) [[code]](https://codeberg.org/Flowee/js) - Bindings for using Flowee applications and libraries with the NodeJS JavaScript engine. +- 🔵 [mainnet-js](https://mainnet.cash/) [[code]](https://github.com/mainnet-cash/mainnet-js) - Typescript library, also available over rest. +- [`<qr-code>`](https://github.com/bitjson/qr-code) [[demo]](https://qr.bitjson.com/) – A no-framework, no-dependencies, customizable, animate-able, SVG-based `<qr-code>` HTML element. + +### Javascript + +- [bch-js](https://github.com/Permissionless-Software-Foundation/bch-js) [[docs]](https://bchjs.fullstack.cash/) - JavaScript library for creating web and mobile apps that can interact with the Bitcoin Cash (BCH) and eCash (XEC) blockchains +- [electrum-cli](https://github.com/rkalis/electrum-cli) - Super simple command line electrum client. +- [bitcore-lib-cash](https://github.com/bitpay/bitcore/tree/master/packages/bitcore-lib-cash) - JavaScript library, maintained by bitpay.
+ +### Python + +- 🔵 [bitcash](https://pybitcash.github.io/bitcash/) [[code]](https://github.com/pybitcash/bitcash) [[docs]](https://bitcash.dev) - python3 library. +- [jtoomim/p2pool](https://github.com/jtoomim/p2pool) - jtoomim fork of bitcoin pool mining software. + +### Rust + +- 🔵 [rust-bitcoincash](https://gitlab.com/rust-bitcoincash/rust-bitcoincash/) - Rust Bitcoin Cash library. + +### Java + +- [bitcoincashj](https://github.com/pokkst/bitcoincashj) - Bitcoin Cash library for Java + +### C + +- [Breadwallet Core](https://github.com/breadwallet/breadwallet-core) - SPV bitcoin C library. + +### PHP + +- [cashp](https://github.com/Ekliptor/cashp) - Library for BCH. + +### R + +- [rbch](https://cran.r-project.org/package=rbch) - Extraction and Statistical Analysis of Data from the BCH Blockchain + +# Endorsements + +Below is a list of endorsements made in the [Chip Process](#chip-process) in reverse chronological order. + +## The [Adaptive Blocksize Limit Algorithm (ebaa) CHIP](https://gitlab.com/0353F40E/ebaa) for the May 2024 BCH Upgrade is AWESOME! + +[a42f44791b343ffcc118b0dd6645972e9a165e83](https://gitlab.com/0353F40E/ebaa/-/commit/a42f44791b343ffcc118b0dd6645972e9a165e83) + + +## The [CashTokens](https://bitcoincashresearch.org/t/chip-2022-02-cashtokens-token-primitives-for-bitcoin-cash/725) and [P2SH32 CHIP](https://bitcoincashresearch.org/t/chip-2022-05-pay-to-script-hash-32-p2sh32-for-bitcoin-cash/806) Proposals for the May 2023 BCH Upgrade are AWESOME! + +[539b2a492002da881a9ef9aa6604327299c7a498](https://github.com/bitjson/cashtokens/commit/539b2a492002da881a9ef9aa6604327299c7a498) + + + +# The Archive + +Due to the nature of bitcoin, some stuff is forever... + +- [chaintip](https://www.chaintip.org) - An on-chain non-custodial tipping bot for reddit/twitter & github. 
[DEPRECATED due to reddit API access changes] + +## Bitcoin Script tools + +- [spedn](https://spedn.pl/) [[code]](https://bitbucket.org/o-studio/spedn/src/develop/) [[docs]](https://spedn.readthedocs.io/en/latest/) - a high level smart contract language that compiles to Bitcoin Cash Script. +- [meep](https://github.com/gcash/meep) - a command line Bitcoin Cash script debugger. + +## Simple Ledger Protocol (SLP Token) + +The Permissionless Software Foundation is actively maintaining an SLP wallet and indexer, denoted with stars (⭐) below. + +### Protocols + +- Simple Ledger Protocol (SLP) [[specs]](https://slp.dev) - for handling ERC-20 style tokens. +- [Simple Ledger Postage Protocol](https://github.com/simpleledger/slp-specifications/blob/master/slp-postage-protocol.md) - Protocol for sending SLP tokens without BCH "gas". + +### Libraries + +- **⭐ SLP Indexer ⭐** [[code]](https://github.com/Permissionless-Software-Foundation/psf-slp-indexer) - Functional SLP token indexer running token infrastructure for several businesses. +- Simple Ledger [[repos]](https://github.com/simpleledger) - Group leading SLP token integration. (Typescript & Python) +- [SLP Explorer](https://simpleledger.info/) [[code]](https://github.com/salemkode/slp-explorer) [[backend src]](https://github.com/salemkode/slp-explorer-backend) - Slp explorer for bitcoin cash. +- SLPDB [[code]](https://github.com/simpleledger/SLPDB) [[doc]](https://slp.dev/tooling/slpdb/) - simpleledger indexer +- [gs++](https://gs.fountainhead.cash/) [[code]](https://github.com/blockparty-sh/cpp_slp_graph_search) [[doc]](https://gs.fountainhead.cash/swagger.html) - a fast SLP indexer, validator, and graph search server. +- [SLP Stream](https://slpstream.fountainhead.cash/channel) [[code]](https://github.com/blockparty-sh/slpstream) [[doc]](https://slp.dev/tooling/slpstream/) - a frontend API for GS++ that provides a streaming output of new transactions.
+- [goslp](https://github.com/simpleledgerinc/goslp) - SLP go libraries. +- [SLP Indexer](https://github.com/Bitcoin-com/slp-indexer) - bitcoin.com indexer. +- [SLP Icons](https://github.com/kosinusbch/slp-token-icons) - Hosted icons for slp tokens. + +## SLP Token Projects + +- **⭐ [PSF wallet](https://wallet.fullstack.cash/) ⭐** [[code]](https://github.com/Permissionless-Software-Foundation/gatsby-ipfs-web-wallet) - A web wallet with SLP support. +- [SLP Explorer](https://simpleledger.info/) [[code]](https://github.com/salemkode/slp-explorer) [[backend src]](https://github.com/salemkode/slp-explorer-backend) - Open source explorer for SLP tokens. +- Electron-Cash SLP Edition [[code]](https://github.com/simpleledger/Electron-Cash-SLP) [[releases]](https://github.com/simpleledger/Electron-Cash-SLP/releases) +- Honk Token [[archive]](https://web.archive.org/web/20230921212507/https://honk.cash/) [[whitepaper]](https://web.archive.org/web/20220409174235/https://www.honk.cash/whitepaper.pdf) - A gambling/gaming/multipurpose SLP token. +- mistcoin [[archive]](http://web.archive.org/web/20210128134553/https://mistcoin.org/) [[blue miner]](https://gitlab.com/blue_mist/miner) - A mineable SLP token using a proof-of-work covenant contract +- SpiceToken [[archive]](https://web.archive.org/web/20230216030610/https://spicetoken.org/) - A meme SLP token for social tipping. + + +tons of data from awesome monero: + +# Awesome Monero List + +A curated list of awesome Monero libraries, tools, and resources.
+ +## Contents + +- [Resources](#resources) +- [Wallets](#wallets) +- [Libraries](#libraries) +- [Docker](#docker) +- [Tools](#tools) +- [Nodes](#nodes) +- [Blockchain Explorers](#blockchain-explorers) +- [Built with Monero](#built-with-monero) +- [Mining](#mining) +- [Decentralized Exchanges](#decentralized-exchanges) +- [Atomic Swaps](#atomic-swaps) +- [Integrations](#integrations) +- [Merchants](#merchants) +- [Point of Sale](#point-of-sale) +- [Future development](#future-development) +- [Other](#other) + +## Resources + +- [Official Website](https://getmonero.org/) +- [Official GitHub](https://github.com/monero-project/monero) +- [Official Twitter](https://twitter.com/monero) +- [Official Reddit](https://www.reddit.com/r/Monero/) +- [Unofficial Docs](https://docs.monero.study/) +- [Monero Research Lab](https://github.com/monero-project/research-lab) + +- [Implementing Seraphis](https://raw.githubusercontent.com/UkoeHB/Seraphis/master/implementing_seraphis/Impl-Seraphis-0-0-2.pdf) +- [RandomX](https://github.com/tevador/RandomX) - RandomX is a proof-of-work (PoW) algorithm that is optimized for general-purpose CPUs. +- [LMDB](https://github.com/LMDB/lmdb) - Lightning Memory-Mapped Database + +### Books + +- [Mastering Monero](https://github.com/monerobook/monerobook) - "Mastering Monero: The future of private transactions" is your guide through the world of Monero, a leading cryptocurrency with a focus on private and censorship-resistant transactions. This book contains everything you need to know to start using Monero in your business or day-to-day life, even if you've never understood or interacted with cryptocurrencies before. +- [monero-book](https://github.com/Cuprate/monero-book) - This book aims to document the Monero protocol. Currently, work is being done to document Monero's consensus rules. This is being completed as a part of [Cuprate](https://github.com/Cuprate/cuprate), the Rust Monero node.
([Website](https://monero-book.cuprate.org/)) + +## Wallets + +### Desktop Wallets + +- [Monero GUI Wallet](https://getmonero.org/downloads/) - Official desktop wallet +- [Feather Wallet](https://github.com/feather-wallet/feather) ([Website](https://featherwallet.org/)) - Lightweight desktop wallet +- [monero-wallet-generator](https://github.com/moneromooo-monero/monero-wallet-generator) - Self-contained offline JavaScript Monero wallet generator +- [Cake Wallet](https://github.com/cake-tech/cake_wallet) - Popular iOS and Android wallet and desktop wallet + +### Mobile Wallets + +- [Cake Wallet](https://github.com/cake-tech/cake_wallet) - Popular iOS and Android wallet and desktop wallet +- [Monerujo](https://github.com/m2049r/xmrwallet) - Popular Android wallet +- [Stack Wallet](https://github.com/cypherstack/stack_wallet) - A multicoin, cryptocurrency wallet +- [ANONERO](http://anonero.io/) - Hardened wallet with enforced privacy & security for Android (onion link) +- [MYSU](http://rk63tc3isr7so7ubl6q7kdxzzws7a7t6s467lbtw2ru3cwy6zu6w4jad.onion/) - A no-bullshit, pure Monero wallet suitable for both newcomers and experienced users. For Android. (onion link) + +### Hardware Wallets + +- [Kastelo](https://github.com/monero-project/kastelo) - This is the project to create an official Monero Hardware Wallet (Dead project) +- [passport2-monero](https://github.com/mjg-foundation/passport2-monero) - v2.x.x series of firmware for Passport, rebuilt for monero +- [MoneroSigner](https://github.com/Monero-HackerIndustrial/MoneroSigner) - Seedsigner Monero fork. Use an air-gapped Raspberry Pi Zero to sign monero transactions! +- [Monero Ledger App](https://github.com/LedgerHQ/app-monero) - Monero wallet application for Ledger Nano S and Nano X. (avoid buying Ledger products) + +### Other Wallets +- [Monero Subscriptions Wallet](https://github.com/lukeprofits/Monero_Subscriptions_Wallet) - A Monero wallet that automatically pays subscriptions.
+ +## Libraries + +- [monero-ts](https://github.com/woodser/monero-ts) - Monero TypeScript library for Node.js and browsers +- [monerophp](https://github.com/monero-integrations/monerophp) - A Monero library written in PHP by the Monero Integrations team. +- [monero-python](https://github.com/monero-integrations/monero-python) - A comprehensive Python module for handling Monero cryptocurrency +- [monero-rpc-php](https://github.com/refring/monero-rpc-php) - Monero daemon and wallet RPC client library written in modern PHP. +- [monero-java](https://github.com/woodser/monero-java) - Java library for using Monero +- [monero-rs](https://github.com/monero-rs/monero-rs) - Library with support for de/serialization on block data structures and key/address generation and scanning related to Monero cryptocurrency. +- [libmonero](https://github.com/monumexyz/libmonero) - libmonero is a library for the Monero cryptocurrency written in Rust. It is designed to be fast, safe and easy to use. +- [monero-cpp](https://github.com/woodser/monero-cpp) - C++ library for using Monero +- [go-monero-rpc-client](https://github.com/omani/go-monero-rpc-client) - A go client for the Monero wallet and daemon RPC +- [go-monero](https://github.com/duggavo/go-monero) - A multi-platform Go library for interacting with Monero servers either on clearnet or not, supporting daemon and wallet RPC, p2p commands and ZeroMQ. + +## Docker + +- [Simple Monerod Docker](https://github.com/sethforprivacy/simple-monerod-docker) - A simple docker image for running a Monero node. +- [Monero Suite](https://github.com/hundehausen/monero-suite) ([Website](https://monerosuite.org)) - Build your personal docker-compose.yml file for Monero services. 
+- [Docker-XMRig](https://github.com/metal3d/docker-xmrig) - Xmrig containerized to mine monero cryptocurrency +- [Moneroblock Docker](https://github.com/sethforprivacy/moneroblock-docker) - A simple and straightforward Dockerized MoneroBlock built from source and exposing standard ports. + +## Tools + +- [Monero Inflation Checker](https://github.com/DangerousFreedom1984/monero_inflation_checker) - Minimal Python tools and educational material for checking inflation in Monero. You can get more information at moneroinflation.com. +- [Monero Vanity Address Generator](https://github.com/hinto-janai/monero-vanity) - Monero vanity address generator for CPUs +- [monero-lws](https://github.com/vtnerd/monero-lws) - Monero Light Wallet Server (scans monero viewkeys and implements mymonero API) + +## Nodes + +- [Monero Node List](https://moneroworld.com/) - A list of public Monero nodes. +- [Monero Node Scanner](https://monerohash.com/nodes-distribution.html) - A tool to scan the Monero network for nodes. +- [monero.fail](https://monero.fail/) - Monero public node aggregator. +- [Monerod-in-Termux](https://github.com/CryptoGrampy/android-termux-monero-node) - Run a Monero Node on Android using Termux +- [check-monero-seed-nodes](https://github.com/plowsof/check-monero-seed-nodes) - A script to check the status of Monero seed nodes +- [Monero Node for Umbrel](https://github.com/deverickapollo/umbrel-monero) - Run a Monero node on your Umbrel personal server. +- [xmr.sh](https://github.com/vdo/xmr.sh) - xmr.sh script wizard sets up a new server running a monero node daemon with Docker compose, with your choice of SSL certificates for your domain, network selection, a Tor hidden service, Grafana dashboard and more.
+- [Monero Nodo](https://github.com/MoneroNodo/Nodo) - Software running on a [Monero Nodo](https://moneronodo.com/): Monero Full Node on powerful hardware + +## Blockchain Explorers + +- [Onion Monero Blockchain Explorer](https://github.com/moneroexamples/onion-monero-blockchain-explorer) - A Monero blockchain explorer. +- [Moneroblock](https://github.com/duggavo/MoneroBlock) - Decentralized and trustless Monero block explorer + +## Built with Monero + +- [Nerostr](https://github.com/pluja/nerostr) - nostr paid relay, but with monero +- [NEVEKO](https://github.com/creating2morrow/neveko) - full-stack privacy application with gpg messaging, monero multisig and built-in i2p marketplace +- [Split My Lunch](https://github.com/AlexAnarcho/split-my-lunch) - Allow co-workers to split the lunch bill in Monero +- [XMR-T3-starter](https://gitlab.com/monero-studio/xmr-t3-starter) - A starter template for a T3 web app with monero-ts. t3-stack: nextjs (react), typescript, tailwind, trpc, prisma also includes: shadcn/ui, monero-ts + +## Mining + +- [XMRig](https://github.com/xmrig/xmrig) - High performance, open source, cross platform RandomX, CryptoNight and Argon2 CPU/GPU miner +- [Gupax](https://github.com/hinto-janai/gupax) - A simple GUI for mining Monero on P2Pool, using XMRig. +- [P2Pool](https://github.com/SChernykh/p2pool) - P2Pool is a decentralized Monero mining pool that works by creating a peer-to-peer network of miner nodes. +- [XMRig Proxy](https://github.com/xmrig/xmrig-proxy) - Stratum proxy with Web interface, support for several backup pools, and more. 
+- [Docker-XMRig](https://github.com/metal3d/docker-xmrig) - Xmrig containerized to mine monero cryptocurrency +- [MoneroOS](https://github.com/4rkal/MoneroOS) - Plug and play monero mining archuseriso config +- [XMRig for Android](https://github.com/XMRig-for-Android/xmrig-for-android) - ⛏ Mine Monero from your android device + +## Decentralized Exchanges + +- [Bisq](https://github.com/bisq-network/bisq) ([Website](https://bisq.network/)) - A decentralized exchange network for trading Monero and other cryptocurrencies. +- [Haveno](https://github.com/haveno-dex/haveno) - A decentralized, peer-to-peer, non-custodial Monero exchange for trading fiat currencies for Monero. +- [Serai](https://github.com/serai-dex/serai) - Serai is a new DEX, built from the ground up, initially planning on listing Bitcoin, Ethereum, DAI, and Monero, offering a liquidity-pool-based trading experience. Funds are stored in an economically secured threshold-multisig wallet. +- [BasicSwapDex](https://github.com/tecnovert/basicswap) ([Website](https://basicswapdex.com/)) - The BasicSwap DEX is a privacy-first and decentralized exchange which features cross-chain atomic swaps and a distributed order book. + +## Atomic Swaps + +- [XMR to BTC Atomic Swap](https://github.com/comit-network/xmr-btc-swap) - Bitcoin–Monero Cross-chain Atomic Swap +- [ETH-XMR Atomic Swaps](https://github.com/AthanorLabs/atomic-swap) - 💫 ETH-XMR atomic swap implementation +- [UnstoppableSwap GUI](https://github.com/UnstoppableSwap/unstoppableswap-gui) - Graphical User Interface (GUI) For Trustless Cross-Chain XMR<>BTC Atomic Swaps +- [BCH-XMR-SWAP PoC](https://github.com/PHCitizen/bch-xmr-swap) - A proof of concept for a Bitcoin Cash to Monero atomic swap +- [Farcaster Project](https://github.com/farcaster-project) - Farcaster is a cross-chain atomic swap protocol and implementation that allows exchanging Bitcoin and Monero in a peer-to-peer manner with anyone running a Farcaster node.
+- [Samourai XMR-BTC Swap Beta](https://code.samourai.io/wallet/comit-swaps-java) - A GUI for COMIT XMR-BTC atomic swaps with modifications to further enhance anonymity, with the Automated Swap Backend (ASB) built-in, as well as Samourai Wallet Whirlpool for automatic mixing of redeemed BTC. (Beta!) + + +## Merchants + +- [Monero Merchants](https://www.monerooutreach.org/stories/monero_merchants.html) - A list of merchants that accept Monero as payment. +- [Monerica](https://github.com/monerica-project/monerica) ([Website](https://monerica.com/)) - A directory for a Monero circular economy +- [Monero for Merchants](https://github.com/ASchmidt1024/monero-for-merchants-booklet) - A printable booklet to attract merchants to accept Monero (multiple languages!) + +## Point of Sale + +- [Kasisto](https://github.com/amiuhle/kasisto) - A Monero Point of Sale payment system +- [Monero Gateway for WooCommerce](https://github.com/monero-integrations/monerowp) - A Monero WooCommerce Plugin for Wordpress +- [MoneroPay](https://github.com/moneropay/moneropay) - A Monero payment gateway for WooCommerce +- [Monero Merchant](https://github.com/RuiSiang/monero-merchant) - Monero Merchant is a RESTful API wrapper for the official Monero wallet RPC. This project is mainly for merchants who hope to accept Monero as payment. +- [AcceptXMR](https://github.com/busyboredom/acceptxmr) - This library aims to provide a simple, reliable, and efficient means to track monero payments. +- [HotShop](https://github.com/CryptoGrampy/HotShop) - An Ephemeral, browser-based, no-private-key, no-server Point of Sale for receiving and validating Monero payments. Repository is archived :( +- [monerochan-merchant-rpc](https://github.com/spirobel/monerochan-merchant-rpc) - A tool to accept digital cash at your online business. + +## Future development + +- [Seraphis](https://github.com/UkoeHB/Seraphis) - Seraphis is a privacy-focused transaction protocol for p2p electronic cash systems (e.g. 
cryptocurrencies). +- [Full chain membership proofs](https://github.com/kayabaNerve/full-chain-membership-proofs) +- [Cuprate](https://github.com/Cuprate/cuprate) - an upcoming experimental, modern & secure monero node. Written in Rust. +- [wallet3](https://github.com/seraphis-migration/wallet3) - Info and discussions about a hypothetical full 'wallet2' rewrite from scratch From e335cc44d69fdee82879a83db51a52c851f95333 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 28 Apr 2024 21:10:34 +0330 Subject: [PATCH 50/74] correct formatting --- src/parser/link_url/ip/ip_literal.rs | 12 +- src/parser/link_url/ip/ipv4.rs | 2 +- src/parser/link_url/ip/ipv6.rs | 8 +- src/parser/link_url/ip/ipvfuture.rs | 20 ++-- src/parser/link_url/ip/mod.rs | 6 +- src/parser/link_url/mod.rs | 56 ++++------ src/parser/link_url/parse_link.rs | 103 +++++++++--------- src/parser/mod.rs | 2 +- .../hashtag_content_char_ranges.rs | 2 +- .../parse_from_text/markdown_elements.rs | 12 +- src/parser/parse_from_text/text_elements.rs | 10 +- src/parser/utils.rs | 2 - tests/text_to_ast/desktop_set.rs | 36 ++---- tests/text_to_ast/markdown.rs | 86 +++++---------- tests/text_to_ast/mod.rs | 6 +- 15 files changed, 146 insertions(+), 217 deletions(-) diff --git a/src/parser/link_url/ip/ip_literal.rs b/src/parser/link_url/ip/ip_literal.rs index 1c8e472..0efaf7b 100644 --- a/src/parser/link_url/ip/ip_literal.rs +++ b/src/parser/link_url/ip/ip_literal.rs @@ -1,20 +1,12 @@ use nom::{ - branch::alt, - character::complete::char, - combinator::recognize, - sequence::tuple, - IResult, + branch::alt, character::complete::char, combinator::recognize, sequence::tuple, IResult, }; use crate::parser::{ + link_url::ip::{ipv6::ipv6, ipvfuture::ipvfuture}, parse_from_text::base_parsers::CustomError, - link_url::ip::{ - ipvfuture::ipvfuture, - ipv6::ipv6, - }, }; - pub fn ip_literal(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple((char('['), alt((ipv6, ipvfuture)), char(']'))))(input) } 
diff --git a/src/parser/link_url/ip/ipv4.rs b/src/parser/link_url/ip/ipv4.rs index c082e56..0012556 100644 --- a/src/parser/link_url/ip/ipv4.rs +++ b/src/parser/link_url/ip/ipv4.rs @@ -1,5 +1,5 @@ use nom::{ - character::complete::{u8, char}, + character::complete::{char, u8}, combinator::recognize, sequence::tuple, IResult, diff --git a/src/parser/link_url/ip/ipv6.rs b/src/parser/link_url/ip/ipv6.rs index d3f0546..340b592 100644 --- a/src/parser/link_url/ip/ipv6.rs +++ b/src/parser/link_url/ip/ipv6.rs @@ -8,10 +8,7 @@ use nom::{ IResult, }; -use crate::parser::{ - parse_from_text::base_parsers::CustomError, - utils::is_hex_digit, -}; +use crate::parser::{parse_from_text::base_parsers::CustomError, utils::is_hex_digit}; use super::ipv4::ipv4; @@ -36,7 +33,6 @@ fn h16_and_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple((h16, char(':'))))(input) } - fn double_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { tag("::")(input) } @@ -62,7 +58,7 @@ pub fn ipv6(input: &str) -> IResult<&str, &str, CustomError<&str>> { count(h16_and_period, 3), ls32, ))), - // [<0 to 2 h16_and_period> ] :: <2*h16_and_period> + // [<0 to 2 h16_and_period> ] :: <2*h16_and_period> recognize(tuple(( opt(tuple((many_m_n(0, 2, h16_and_period), h16))), double_period, diff --git a/src/parser/link_url/ip/ipvfuture.rs b/src/parser/link_url/ip/ipvfuture.rs index 923e5f6..78a68e1 100644 --- a/src/parser/link_url/ip/ipvfuture.rs +++ b/src/parser/link_url/ip/ipvfuture.rs @@ -1,18 +1,11 @@ use nom::{ - bytes::complete::take_while_m_n, - character::complete::char, - combinator::recognize, - sequence::tuple, - IResult, + bytes::complete::take_while_m_n, character::complete::char, combinator::recognize, + sequence::tuple, IResult, }; use crate::parser::{ - utils::{ - is_hex_digit, - is_sub_delim, - is_unreserved, - }, parse_from_text::base_parsers::CustomError, + utils::{is_hex_digit, is_sub_delim, is_unreserved}, }; fn is_ipvfuture_last(ch: char) -> bool { @@ 
-20,5 +13,10 @@ fn is_ipvfuture_last(ch: char) -> bool { } pub fn ipvfuture(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(tuple((char('v'), take_while_m_n(1, 1, is_hex_digit), char('.'), take_while_m_n(1, 1, is_ipvfuture_last))))(input) + recognize(tuple(( + char('v'), + take_while_m_n(1, 1, is_hex_digit), + char('.'), + take_while_m_n(1, 1, is_ipvfuture_last), + )))(input) } diff --git a/src/parser/link_url/ip/mod.rs b/src/parser/link_url/ip/mod.rs index 75bbe3d..8ba551d 100644 --- a/src/parser/link_url/ip/mod.rs +++ b/src/parser/link_url/ip/mod.rs @@ -1,4 +1,4 @@ -mod ipvfuture; -mod ipv6; -pub(crate) mod ipv4; pub(crate) mod ip_literal; +pub(crate) mod ipv4; +mod ipv6; +mod ipvfuture; diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index 1bb4aa1..a1ae197 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -1,17 +1,12 @@ -mod parse_link; mod ip; +mod parse_link; use nom::{ - Slice, - IResult, - error::{ParseError, ErrorKind}, -}; - -use crate::parser::{ - parse_from_text::base_parsers::CustomError, - link_url::parse_link::parse_link, + error::{ErrorKind, ParseError}, + IResult, Slice, }; +use crate::parser::{link_url::parse_link::parse_link, parse_from_text::base_parsers::CustomError}; ///! 
Parsing / Validation of URLs /// @@ -44,44 +39,40 @@ pub struct PunycodeWarning { pub punycode_encoded_url: String, } - impl LinkDestination<'_> { /// parse a link that is not in a delimited link or a labled link, just a part of normal text /// it has a whitelist of schemes, because otherwise /* - pub(crate) fn parse_standalone_with_whitelist( - input: &str, - ) -> IResult<&str, LinkDestination, CustomError<&str>> { - if let Ok((rest, link_destination)) = parse_link(input) { - if link_destination.hostname.is_none() { - // if it's a generic url like geo:-15.5,41.1 - if !is_allowed_generic_scheme(link_destination.scheme) { - Err(nom::Err::Error(CustomError::InvalidLink)) + pub(crate) fn parse_standalone_with_whitelist( + input: &str, + ) -> IResult<&str, LinkDestination, CustomError<&str>> { + if let Ok((rest, link_destination)) = parse_link(input) { + if link_destination.hostname.is_none() { + // if it's a generic url like geo:-15.5,41.1 + if !is_allowed_generic_scheme(link_destination.scheme) { + Err(nom::Err::Error(CustomError::InvalidLink)) + } else { + Ok((rest, link_destination)) + } } else { - Ok((rest, link_destination)) + Ok(( + rest, + link_destination + )) } } else { - Ok(( - rest, - link_destination - )) + Err(nom::Err::Error(CustomError::InvalidLink)) } - } else { - Err(nom::Err::Error(CustomError::InvalidLink)) } - } -*/ + */ pub(crate) fn parse(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { if let Ok((rest, link_destination)) = parse_link(input) { - Ok(( - rest, - link_destination - )) + Ok((rest, link_destination)) } else { Err(nom::Err::Error(CustomError::InvalidLink)) } } - + pub(crate) fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (mut remaining, mut link) = Self::parse(input)?; if let Some(first) = remaining.chars().next() { @@ -95,7 +86,6 @@ impl LinkDestination<'_> { } } - #[derive(Debug, PartialEq, Eq)] pub enum LinkParseError { Nom(I, ErrorKind), diff --git 
a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index 10b129e..5eb16b1 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -2,26 +2,24 @@ use std::ops::RangeInclusive; use nom::{ branch::alt, - Slice, bytes::complete::{tag, take_while, take_while1, take_while_m_n}, character::complete::char, combinator::{opt, recognize}, multi::{many0, many1}, sequence::tuple, - IResult, + IResult, Slice, }; use crate::parser::{ - parse_from_text::base_parsers::CustomError, link_url::{ - PunycodeWarning, - LinkDestination, - ip::{ - ipv4::ipv4, - ip_literal::ip_literal, - }, + ip::{ip_literal::ip_literal, ipv4::ipv4}, + LinkDestination, PunycodeWarning, + }, + parse_from_text::base_parsers::CustomError, + utils::{ + is_alpha, is_digit, is_hex_digit, is_in_one_of_ranges, is_not_white_space, is_sub_delim, + is_unreserved, }, - utils::{is_not_white_space, is_alpha, is_hex_digit, is_digit, is_in_one_of_ranges, is_sub_delim, is_unreserved}, }; /// determines which generic schemes (without '://') get linkifyed @@ -43,7 +41,6 @@ fn is_allowed_generic_scheme(scheme: &str) -> bool { ) } - // These ranges have been extracted from RFC3987, Page 8. const UCSCHAR_RANGES: [RangeInclusive; 17] = [ 0xa0..=0xd7ff, @@ -73,7 +70,6 @@ fn is_iunreserved(c: char) -> bool { is_unreserved(c) || is_ucschar(c) } - // Here again, order is important. As URLs/IRIs have letters in them // most of the time and less digits or other characters. --Farooq fn is_scheme(c: char) -> bool { @@ -122,7 +118,6 @@ fn take_while_ireg(input: &str) -> IResult<&str, &str, CustomError<&str>> { ))))(input) } - /// Parse the iauthority block /// # Description /// An iauthority is... 
@@ -186,7 +181,6 @@ fn is_iquery_not_pct_encoded(c: char) -> bool { is_iprivate(c) || is_ipchar_not_pct_encoded(c) || matches!(c, '/' | '?') } - /// Consume an iquery block fn iquery(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many0(alt(( @@ -196,18 +190,13 @@ fn iquery(input: &str) -> IResult<&str, &str, CustomError<&str>> { } fn take_while_ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(many0(alt(( - take_while_ipchar1, - tag("/"), - tag("?"), - ))))(input) + recognize(many0(alt((take_while_ipchar1, tag("/"), tag("?")))))(input) } - /// Consume scheme characters from input /// /// # Description -/// This function as it can be seen, consumes exactly an alpha and as many +/// This function as it can be seen, consumes exactly an alpha and as many /// scheme characters as there are. then it gets a slice of input(as cloned to i) /// /// # Arguments @@ -229,13 +218,14 @@ fn scheme(input: &str) -> IResult<&str, &str, CustomError<&str>> { } } - /// Take as many pct encoded blocks as there are. 
a block is %XX where X is a hex digit fn take_while_pct_encoded(input: &str) -> IResult<&str, &str, CustomError<&str>> { - recognize(many1(tuple((char('%'), take_while_m_n(2, 2, is_hex_digit)))))(input) + recognize(many1(tuple(( + char('%'), + take_while_m_n(2, 2, is_hex_digit), + ))))(input) } - /// encode a host to punycode encoded string fn punycode_encode(host: &str) -> String { host.split('.') @@ -254,7 +244,6 @@ fn punycode_encode(host: &str) -> String { .join(".") } - /// Returns true if host string contains non ASCII characters fn is_puny(host: &str) -> bool { for ch in host.chars() { @@ -311,15 +300,22 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let (_, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; let query = query.unwrap_or(""); // in the case of no iquery let fragment = fragment.unwrap_or(""); // in the case of no ifragment - let ihier_len = 3usize.saturating_add(authority.len()).saturating_add(host.len()).saturating_add(path.len()); + let ihier_len = 3usize + .saturating_add(authority.len()) + .saturating_add(host.len()) + .saturating_add(path.len()); // compute length of authority + host + path - let mut len = scheme.len().saturating_add(ihier_len).saturating_add(query.len()).saturating_add(fragment.len()); + let mut len = scheme + .len() + .saturating_add(ihier_len) + .saturating_add(query.len()) + .saturating_add(fragment.len()); // compute length of link which is ihier_len + scheme + query + fragment if let Some(link) = input_.get(0..len) { if link.ends_with([':', ';', '.', ',']) { len -= 1; if path.is_empty() && query.is_empty() && fragment.is_empty() { - host = input_.slice(scheme.len()+3..input_.len()-1); + host = input_.slice(scheme.len() + 3..input_.len() - 1); } } @@ -394,7 +390,6 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { } } - let link = input_.slice(0..len); let input = input_.slice(len..); @@ -432,12 +427,15 @@ fn 
parse_generic(input: &str) -> IResult<&str, LinkDestination, CustomError<&str let (input, rest) = take_while(is_not_white_space)(input)?; let len = scheme.len().saturating_add(rest.len()); if let Some(target) = i.get(0..len) { - return Ok((input, LinkDestination { - scheme, - target, - hostname: None, - punycode: None, - })); + return Ok(( + input, + LinkDestination { + scheme, + target, + hostname: None, + punycode: None, + }, + )); } Err(nom::Err::Failure(CustomError::NoContent)) } @@ -449,7 +447,10 @@ pub(super) fn parse_link(input: &str) -> IResult<&str, LinkDestination, CustomEr #[cfg(test)] mod test { #![allow(clippy::unwrap_used)] - use crate::parser::{LinkDestination, link_url::parse_link::{punycode_encode, PunycodeWarning}}; + use crate::parser::{ + link_url::parse_link::{punycode_encode, PunycodeWarning}, + LinkDestination, + }; #[test] fn basic_parsing() { @@ -475,13 +476,11 @@ mod test { "ftp://test-test", ]; - let test_cases_with_puny = vec![ - "https://ü.app#help", - "http://münchen.de", - ]; + let test_cases_with_puny = vec!["https://ü.app#help", "http://münchen.de"]; for input in &test_cases_no_puny { - let (rest, link_destination) = LinkDestination::parse(input).expect(&format!("Test failed: {input}")); + let (rest, link_destination) = + LinkDestination::parse(input).expect(&format!("Test failed: {input}")); assert_eq!(input, &link_destination.target); assert_eq!(rest.len(), 0); @@ -489,7 +488,8 @@ mod test { } for input in &test_cases_with_puny { - let (rest, link_destination) = LinkDestination::parse(input).expect("Test failed: {input}"); + let (rest, link_destination) = + LinkDestination::parse(input).expect("Test failed: {input}"); assert!(link_destination.punycode.is_some()); assert_eq!(rest.len(), 0); @@ -577,22 +577,23 @@ mod test { punycode: None, target: "mailto:someone@example.com" } - ) ); assert_eq!( LinkDestination::parse("bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka") .unwrap() .1, - LinkDestination { - hostname: None, - scheme: 
"bitcoin", - target: "bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka", - punycode: None, - } - ); + LinkDestination { + hostname: None, + scheme: "bitcoin", + target: "bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka", + punycode: None, + } + ); assert_eq!( - LinkDestination::parse("geo:37.786971,-122.399677").unwrap().1, + LinkDestination::parse("geo:37.786971,-122.399677") + .unwrap() + .1, LinkDestination { scheme: "geo", punycode: None, diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 72ad79a..9066e6c 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,6 +1,6 @@ // mod email; -pub mod parse_from_text; pub mod link_url; +pub mod parse_from_text; mod utils; pub use crate::parser::link_url::{LinkDestination, PunycodeWarning}; diff --git a/src/parser/parse_from_text/hashtag_content_char_ranges.rs b/src/parser/parse_from_text/hashtag_content_char_ranges.rs index 930c105..0d816ec 100644 --- a/src/parser/parse_from_text/hashtag_content_char_ranges.rs +++ b/src/parser/parse_from_text/hashtag_content_char_ranges.rs @@ -882,8 +882,8 @@ pub(crate) fn hashtag_content_char(c: char) -> bool { #[cfg(test)] mod test { - use crate::parser::utils::is_in_one_of_ranges; use crate::parser::parse_from_text::hashtag_content_char_ranges::hashtag_content_char; + use crate::parser::utils::is_in_one_of_ranges; use std::ops::RangeInclusive; #[test] diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index 67e2221..bbbf103 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -6,22 +6,16 @@ use nom::{ IResult, }; +use super::{base_parsers::*, parse_all}; use crate::parser::{ link_url::LinkDestination, parse_from_text::{ - text_elements::{ - email_address, - parse_text_element, - }, base_parsers::direct_delimited, + text_elements::{email_address, parse_text_element}, Element, }, - utils::{ - is_white_space, - is_white_space_but_not_linebreak, - }, 
+ utils::{is_white_space, is_white_space_but_not_linebreak}, }; -use super::{base_parsers::*, parse_all}; fn inline_code(input: &str) -> IResult<&str, &str, CustomError<&str>> { delimited(tag("`"), is_not("`"), tag("`"))(input) diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index 9bbf8ec..aff6d92 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -1,22 +1,20 @@ ///! nom parsers for text elements use nom::{ bytes::{ - complete::{tag, take, take_while1, take_while}, + complete::{tag, take, take_while, take_while1}, streaming::take_till1, }, - character::complete::char, character, + character::complete::char, combinator::{peek, recognize, verify}, sequence::tuple, - AsChar, IResult, Offset, Slice + AsChar, IResult, Offset, Slice, }; - -use crate::parser::link_url::LinkDestination; use super::base_parsers::CustomError; use super::hashtag_content_char_ranges::hashtag_content_char; use super::Element; - +use crate::parser::link_url::LinkDestination; fn linebreak(input: &str) -> IResult<&str, char, CustomError<&str>> { char('\n')(input) diff --git a/src/parser/utils.rs b/src/parser/utils.rs index 8713060..c77ab6b 100644 --- a/src/parser/utils.rs +++ b/src/parser/utils.rs @@ -85,5 +85,3 @@ pub(crate) fn is_not_white_space(c: char) -> bool { pub(crate) fn is_white_space_but_not_linebreak(c: char) -> bool { matches!(c, '\t' | ' ') } - - diff --git a/tests/text_to_ast/desktop_set.rs b/tests/text_to_ast/desktop_set.rs index d6bd311..db11e7b 100644 --- a/tests/text_to_ast/desktop_set.rs +++ b/tests/text_to_ast/desktop_set.rs @@ -207,7 +207,10 @@ fn link() { ), ( "https://delta.chat/en/help?hi=5&e=4#section2.0", - https_link_no_puny("https://delta.chat/en/help?hi=5&e=4#section2.0", "delta.chat"), + https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat", + ), ), ( "https://delta#section2.0", @@ -235,13 +238,10 @@ fn link() { ), ]; - 
let test_cases_with_puny = [ - ( - "https://ü.app#help", - https_link_no_puny("https://ü.app#help", "ü.app") - ) - ]; - + let test_cases_with_puny = [( + "https://ü.app#help", + https_link_no_puny("https://ü.app#help", "ü.app"), + )]; for (input, destination) in &test_cases_no_puny { println!("testing {input}"); @@ -257,22 +257,10 @@ fn link() { println!("testing {input}"); match &parse_desktop_set(input)[0] { Link { destination } => { - assert_eq!( - expected_destination.target, - destination.target - ); - assert_eq!( - expected_destination.scheme, - destination.scheme - ); - assert_eq!( - expected_destination.hostname, - destination.hostname, - ); - assert_eq!( - destination.punycode.is_some(), - true - ); + assert_eq!(expected_destination.target, destination.target); + assert_eq!(expected_destination.scheme, destination.scheme); + assert_eq!(expected_destination.hostname, destination.hostname,); + assert_eq!(destination.punycode.is_some(), true); } _ => { panic!(); diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index dd786dc..cdca291 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -528,7 +528,10 @@ fn link() { ), ( "https://delta.chat/en/help?hi=5&e=4#section2.0", - https_link_no_puny("https://delta.chat/en/help?hi=5&e=4#section2.0", "delta.chat"), + https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat", + ), ), ( "https://delta#section2.0", @@ -556,21 +559,15 @@ fn link() { ), ]; - let test_cases_with_puny = [ - ( - "https://ü.app#help", - https_link_no_puny("https://ü.app#help", "ü.app") - ) - ]; - + let test_cases_with_puny = [( + "https://ü.app#help", + https_link_no_puny("https://ü.app#help", "ü.app"), + )]; for (input, expected_destination) in &test_cases_no_puny { println!("testing {input}"); let result = parse_markdown_text(input); - assert_eq!( - result.len(), - 1 - ); + assert_eq!(result.len(), 1); assert_eq!( result[0], Link { @@ -583,22 +580,10 @@ fn link() 
{ println!("testing {}", input); match &parse_markdown_text(input)[0] { Link { destination } => { - assert_eq!( - expected_destination.target, - destination.target - ); - assert_eq!( - expected_destination.scheme, - destination.scheme - ); - assert_eq!( - expected_destination.hostname, - destination.hostname, - ); - assert_eq!( - destination.punycode.is_some(), - true - ); + assert_eq!(expected_destination.target, destination.target); + assert_eq!(expected_destination.scheme, destination.scheme); + assert_eq!(expected_destination.hostname, destination.hostname,); + assert_eq!(destination.punycode.is_some(), true); } _ => { panic!(); @@ -615,10 +600,12 @@ fn test_link_example() { ), vec![ Text("This is an my site: "), - Link { destination: https_link_no_puny( - "https://delta.chat/en/help?hi=5&e=4#section2.0", - "delta.chat" - )}, + Link { + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat" + ) + }, Linebreak, Text("Visit me there") ] @@ -646,10 +633,12 @@ fn test_delimited_link_example() { ), vec![ Text("This is an my site: <"), - Link { destination: https_link_no_puny( - "https://delta.chat/en/help?hi=5&e=4#section2.0", - "delta.chat" - )}, + Link { + destination: https_link_no_puny( + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "delta.chat" + ) + }, Text(">"), Linebreak, Text("Visit me there") @@ -691,10 +680,7 @@ fn labeled_link_example() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: https_link_no_puny( - "https://delta.chat/en/help", - "delta.chat" - ), + destination: https_link_no_puny("https://delta.chat/en/help", "delta.chat"), }, Text(".") ] @@ -709,10 +695,7 @@ fn labeled_link_can_have_comma_or_dot_at_end() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: https_link_no_puny( - "https://delta.chat/en/help.", - "delta.chat" - ), + destination: https_link_no_puny("https://delta.chat/en/help.", "delta.chat"), }, 
Text(".") ] @@ -723,10 +706,7 @@ fn labeled_link_can_have_comma_or_dot_at_end() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: https_link_no_puny( - "https://delta.chat/en/help,", - "delta.chat" - ), + destination: https_link_no_puny("https://delta.chat/en/help,", "delta.chat"), }, Text(".") ] @@ -737,10 +717,7 @@ fn labeled_link_can_have_comma_or_dot_at_end() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: https_link_no_puny( - "https://delta.chat/en/help:", - "delta.chat" - ), + destination: https_link_no_puny("https://delta.chat/en/help:", "delta.chat"), }, Text(".") ] @@ -751,10 +728,7 @@ fn labeled_link_can_have_comma_or_dot_at_end() { Text("you can find the details "), LabeledLink { label: vec![Text("here")], - destination: https_link_no_puny( - "https://delta.chat/en/help;", - "delta.chat" - ), + destination: https_link_no_puny("https://delta.chat/en/help;", "delta.chat"), }, Text(".") ] diff --git a/tests/text_to_ast/mod.rs b/tests/text_to_ast/mod.rs index e38532a..13737fe 100644 --- a/tests/text_to_ast/mod.rs +++ b/tests/text_to_ast/mod.rs @@ -6,7 +6,7 @@ fn http_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination< target, hostname: Some(hostname), scheme: "http", - punycode: None + punycode: None, } } @@ -15,7 +15,7 @@ fn ftp_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination<' target, hostname: Some(hostname), scheme: "ftp", - punycode: None + punycode: None, } } @@ -24,7 +24,7 @@ fn https_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination target, hostname: Some(hostname), scheme: "https", - punycode: None + punycode: None, } } From ff6fdfea8fe32e8da7195ea7d1f8fdaf235710a9 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Mon, 29 Apr 2024 14:04:09 +0330 Subject: [PATCH 51/74] add new test case for internal labelled links --- tests/text_to_ast/markdown.rs | 9 +++++++++ tests/text_to_ast/mod.rs | 10 
++++++++++ 2 files changed, 19 insertions(+) diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index cdca291..7488d76 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -670,6 +670,15 @@ fn labeled_link() { ), }] ); + assert_eq!( + parse_markdown_text( + "[internal link](#internal)" + ), + vec![LabeledLink { + label: vec![Text("internal link")], + destination: internal_link("#internal") + }] + ); } #[test] diff --git a/tests/text_to_ast/mod.rs b/tests/text_to_ast/mod.rs index 13737fe..b5ff1a1 100644 --- a/tests/text_to_ast/mod.rs +++ b/tests/text_to_ast/mod.rs @@ -1,6 +1,16 @@ use deltachat_message_parser::parser::Element::*; use deltachat_message_parser::parser::LinkDestination; + +fn internal_link<'a>(target: &'a str) -> LinkDestination<'a> { + LinkDestination { + target, + hostname: None, + scheme: "", + punycode: None, + } +} + fn http_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination<'a> { LinkDestination { target, From 79316e9c6ae1d0c43f38cab7c79ec46d668587ea Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Mon, 29 Apr 2024 15:00:26 +0330 Subject: [PATCH 52/74] fix #66 --- rust-toolchain | 2 +- src/parser/link_url/mod.rs | 38 +++++++++++++++++++++++-------- src/parser/link_url/parse_link.rs | 6 ++++- src/parser/mod.rs | 2 +- 4 files changed, 35 insertions(+), 13 deletions(-) diff --git a/rust-toolchain b/rust-toolchain index 9405730..369f996 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -1.64.0 +1.77.2 diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index a1ae197..22b0a9e 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -6,12 +6,18 @@ use nom::{ IResult, Slice, }; -use crate::parser::{link_url::parse_link::parse_link, parse_from_text::base_parsers::CustomError}; +use crate::parser::{ + link_url::parse_link::{ + parse_link, + ifragment, + }, + parse_from_text::base_parsers::CustomError +}; ///! 
Parsing / Validation of URLs /// -/// - hyperlinks (:// scheme) -/// - whitelisted scheme (: scheme) +/// - hyperlinks (:// scheme) according to RFC3987 and RFC3988 +/// - whitelisted scheme (: scheme) according to our own simple thing :) /// /// for hyperlinks it also checks whether the domain contains punycode @@ -74,15 +80,27 @@ impl LinkDestination<'_> { } pub(crate) fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { - let (mut remaining, mut link) = Self::parse(input)?; - if let Some(first) = remaining.chars().next() { - if matches!(first, ';' | '.' | ',' | ':') { - let point = link.target.len().saturating_add(1); - link.target = input.slice(..point); - remaining = input.slice(point..); + match Self::parse(input) { + Ok((mut remaining, mut link)) => { + if let Some(first) = remaining.chars().next() { + if matches!(first, ';' | '.' | ',' | ':') { + let point = link.target.len().saturating_add(1); + link.target = input.slice(..point); + remaining = input.slice(point..); + } + } + Ok((remaining, link)) + } + Err(..) 
=> { + let (remaining, target) = ifragment(input)?; + Ok((remaining, LinkDestination { + target, + scheme: "", + hostname: None, + punycode: None + })) } } - Ok((remaining, link)) } } diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index 5eb16b1..c7f413e 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -268,6 +268,10 @@ pub fn get_puny_code_warning(link: &str, host: &str) -> Option } } +pub fn ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { + recognize(tuple((char('#'), take_while_ifragment)))(input) +} + // IRI links per RFC3987 and RFC3986 #[allow(clippy::integer_arithmetic)] fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { @@ -297,7 +301,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { // which in the third case it's down to ipath-empty(see below) let path = path.unwrap_or(""); // it's ipath-empty let (input, query) = opt(recognize(tuple((char('?'), iquery))))(input)?; - let (_, fragment) = opt(recognize(tuple((char('#'), take_while_ifragment))))(input)?; + let (_, fragment) = opt(ifragment)(input)?; let query = query.unwrap_or(""); // in the case of no iquery let fragment = fragment.unwrap_or(""); // in the case of no ifragment let ihier_len = 3usize diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 9066e6c..d7949b0 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -3,7 +3,7 @@ pub mod link_url; pub mod parse_from_text; mod utils; -pub use crate::parser::link_url::{LinkDestination, PunycodeWarning}; +pub use crate::parser::link_url::LinkDestination; /// The representation of Elements for the Abstract Syntax Tree #[derive(Debug, PartialEq, Eq, Serialize)] From d946113bce6a3804586d05cc063f1ade40114568 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Mon, 29 Apr 2024 15:32:51 +0330 Subject: [PATCH 53/74] new test cases and fix formatting --- src/parser/link_url/mod.rs | 22 
+++++++++---------- tests/text_to_ast/desktop_set.rs | 36 ++++++++++++++++++++++++++++++++ tests/text_to_ast/markdown.rs | 4 +--- tests/text_to_ast/mod.rs | 8 +++++++ 4 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index 22b0a9e..4cf5fdf 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -7,11 +7,8 @@ use nom::{ }; use crate::parser::{ - link_url::parse_link::{ - parse_link, - ifragment, - }, - parse_from_text::base_parsers::CustomError + link_url::parse_link::{ifragment, parse_link}, + parse_from_text::base_parsers::CustomError, }; ///! Parsing / Validation of URLs @@ -93,12 +90,15 @@ impl LinkDestination<'_> { } Err(..) => { let (remaining, target) = ifragment(input)?; - Ok((remaining, LinkDestination { - target, - scheme: "", - hostname: None, - punycode: None - })) + Ok(( + remaining, + LinkDestination { + target, + scheme: "", + hostname: None, + punycode: None, + }, + )) } } } diff --git a/tests/text_to_ast/desktop_set.rs b/tests/text_to_ast/desktop_set.rs index db11e7b..0d647c0 100644 --- a/tests/text_to_ast/desktop_set.rs +++ b/tests/text_to_ast/desktop_set.rs @@ -236,6 +236,42 @@ fn link() { "mailto:foö@ü.chat", mailto_link_no_puny("mailto:foö@ü.chat"), ), + ( + "https://delta.chat/%C3%BC%C3%A4%C3%B6", + https_link_no_puny( + "https://delta.chat/%C3%BC%C3%A4%C3%B6", + "delta.chat", + ) + ), + ( + "https://delta.chat/üäö", + https_link_no_puny( + "https://delta.chat/üäö", + "delta.chat", + ) + ), + ( + "https://90eghtesadi.com/Keywords/Index/2031708/%D9%82%D8%B1%D8%A7%D8%B1%D8%AF%D8%A7%D8%AF-%DB%B2%DB%B5-%D8%B3%D8%A7%D9%84%D9%87-%D8%A7%DB%8C%D8%B1%D8%A7%D9%86-%D9%88-%DA%86%DB%8C%D9%86", + // ^ I guess shame on the Iranian government of the time? 
--Farooq + https_link_no_puny( + "https://90eghtesadi.com/Keywords/Index/2031708/%D9%82%D8%B1%D8%A7%D8%B1%D8%AF%D8%A7%D8%AF-%DB%B2%DB%B5-%D8%B3%D8%A7%D9%84%D9%87-%D8%A7%DB%8C%D8%B1%D8%A7%D9%86-%D9%88-%DA%86%DB%8C%D9%86", + "90eghtesadi.com", + ) + ), + ( + "https://pcworms.ir/صفحه", + https_link_no_puny( + "https://pcworms.ir/صفحه", + "pcworms.ir", + ), + ), + ( + "gopher://republic.circumlunar.space/1/~farooqkz", + gopher_link_no_puny( + "gopher://republic.circumlunar.space/1/~farooqkz", + "republic.circumlunar.space", + ), + ), ]; let test_cases_with_puny = [( diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index 7488d76..573e57f 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -671,9 +671,7 @@ fn labeled_link() { }] ); assert_eq!( - parse_markdown_text( - "[internal link](#internal)" - ), + parse_markdown_text("[internal link](#internal)"), vec![LabeledLink { label: vec![Text("internal link")], destination: internal_link("#internal") diff --git a/tests/text_to_ast/mod.rs b/tests/text_to_ast/mod.rs index b5ff1a1..c72da49 100644 --- a/tests/text_to_ast/mod.rs +++ b/tests/text_to_ast/mod.rs @@ -1,6 +1,14 @@ use deltachat_message_parser::parser::Element::*; use deltachat_message_parser::parser::LinkDestination; +fn gopher_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination<'a> { + LinkDestination { + target, + hostname: Some(hostname), + scheme: "gopher", + punycode: None, + } +} fn internal_link<'a>(target: &'a str) -> LinkDestination<'a> { LinkDestination { From 94ad0d9bb20cf7837a83ec514030b0a9de478c0b Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 16:33:34 +0330 Subject: [PATCH 54/74] micro optimization by inlining --- src/parser/utils.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/parser/utils.rs b/src/parser/utils.rs index c77ab6b..873e380 100644 --- a/src/parser/utils.rs +++ b/src/parser/utils.rs @@ -50,15 +50,17 @@ pub fn 
is_in_one_of_ranges(c: u32, ranges: &[RangeInclusive]) -> bool { } } -// TODO: Convert these(is_alpha, is_hex_digit, is_digit) to macros OR inline +#[inline(always)] pub(crate) fn is_alpha(c: char) -> bool { c.is_alphabetic() } +#[inline(always)] pub(crate) fn is_hex_digit(c: char) -> bool { c.is_ascii_hexdigit() } +#[inline(always)] pub(crate) fn is_digit(c: char) -> bool { c.is_ascii_digit() } From 29d3ff8b9b1bb6082dd7a91418683e0f0644bd41 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 18:11:36 +0330 Subject: [PATCH 55/74] add new test cases --- tests/text_to_ast/markdown.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index 573e57f..b635484 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -557,6 +557,27 @@ fn link() { "mailto:foö@ü.chat", mailto_link_no_puny("mailto:foö@ü.chat"), ), + ( + "gopher://[::1]/", + gopher_link_no_puny( + "gopher://[::1]/", + "[::1]", + ), + ), + ( + "https://[2345:0425:2CA1:0000:0000:0567:5673:23b5]/hello_world", + https_link_no_puny( + "https://[2345:0425:2CA1:0000:0000:0567:5673:23b5]/hello_world", + "[2345:0425:2CA1:0000:0000:0567:5673:23b5]", + ), + ), + ( + "https://[2345:425:2CA1:0:0:0567:5673:23b5]/hello_world", + https_link_no_puny( + "https://[2345:425:2CA1:0:0:0567:5673:23b5]/hello_world", + "[2345:425:2CA1:0:0:0567:5673:23b5]", + ), + ) ]; let test_cases_with_puny = [( From 9a8a6eabcb0377cbbce4b54707f1f36503f84e61 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 18:15:54 +0330 Subject: [PATCH 56/74] add commments about parse_labelled method of LinkDestination --- src/parser/link_url/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index 4cf5fdf..9cfa674 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -75,12 +75,15 @@ impl LinkDestination<'_> { 
Err(nom::Err::Error(CustomError::InvalidLink)) } } - + + // This is for parsing markdown labelled links. pub(crate) fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { match Self::parse(input) { Ok((mut remaining, mut link)) => { if let Some(first) = remaining.chars().next() { if matches!(first, ';' | '.' | ',' | ':') { + // ^ markdown labelled links can include one of these characters at the end + // and it's therefore part of the link let point = link.target.len().saturating_add(1); link.target = input.slice(..point); remaining = input.slice(point..); From a843a147a607658c6dd1e6d987d2aa12c3b72732 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 18:24:39 +0330 Subject: [PATCH 57/74] add benchmarking links --- benches/moar_links.txt | 20 ++++++++++++++++++++ benches/my_benchmark.rs | 12 +++++++++++- src/parser/link_url/ip/ipv6.rs | 3 --- src/parser/link_url/mod.rs | 4 ++-- 4 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 benches/moar_links.txt diff --git a/benches/moar_links.txt b/benches/moar_links.txt new file mode 100644 index 0000000..d5bb877 --- /dev/null +++ b/benches/moar_links.txt @@ -0,0 +1,20 @@ +Let's add some more links just for testing and benching: + +these are some IPv6 links: + +gopher://[::1]/ +https://[::1]/سلام +https://[2345:0425:2CA1:0000:0000:0567:5673:23b5]/hello_world +https://[2345:425:2CA1:0:0:0567:5673:23b5]/hello_world + +an IPvfuture link: +ftp://mrchickenkiller@[vA.A]/var/log/boot.log + +some normal links: + +https://www.ietf.org/rfc/rfc3987.txt +https://iamb.chat/messages/index.html +https://github.com/deltachat/message-parser/issues/67 +https://far.chickenkiller.com +gopher://republic.circumlunar.space +https://far.chickenkiller.com/religion/a-god-who-does-not-care/ diff --git a/benches/my_benchmark.rs b/benches/my_benchmark.rs index 2eb25c7..8410354 100644 --- a/benches/my_benchmark.rs +++ b/benches/my_benchmark.rs @@ -1,10 +1,16 @@ use 
criterion::{black_box, criterion_group, criterion_main, Criterion}; -use deltachat_message_parser::parser::{parse_desktop_set, parse_markdown_text, parse_only_text}; +use deltachat_message_parser::parser::{ + LinkDestination, + parse_desktop_set, + parse_markdown_text, + parse_only_text +}; pub fn criterion_benchmark(c: &mut Criterion) { let testdata = include_str!("testdata.md"); let lorem_ipsum_txt = include_str!("lorem_ipsum.txt"); let r10s_update_message = include_str!("r10s_update_message.txt"); + let links = include_str!("moar_links.txt"); c.bench_function("only_text_lorem_ipsum.txt", |b| { b.iter(|| parse_only_text(black_box(lorem_ipsum_txt))) @@ -35,6 +41,10 @@ pub fn criterion_benchmark(c: &mut Criterion) { c.bench_function("markdown_r10s_update_message.txt", |b| { b.iter(|| parse_markdown_text(black_box(r10s_update_message))) }); + + c.bench_function("parse_link_moar_links.txt", |b| { + b.iter(|| LinkDestination::parse(black_box(links))) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/src/parser/link_url/ip/ipv6.rs b/src/parser/link_url/ip/ipv6.rs index 340b592..7e0b05c 100644 --- a/src/parser/link_url/ip/ipv6.rs +++ b/src/parser/link_url/ip/ipv6.rs @@ -12,8 +12,6 @@ use crate::parser::{parse_from_text::base_parsers::CustomError, utils::is_hex_di use super::ipv4::ipv4; -// consume 1 to 4 hex digit(s) -// TODO These 4 functions should be macros instead fn h16(input: &str) -> IResult<&str, &str, CustomError<&str>> { take_while_m_n(1, 4, is_hex_digit)(input) } @@ -28,7 +26,6 @@ fn ls32(input: &str) -> IResult<&str, &str, CustomError<&str>> { } } -// consume fn h16_and_period(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple((h16, char(':'))))(input) } diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index 9cfa674..bae4124 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -68,7 +68,7 @@ impl LinkDestination<'_> { } } */ - pub(crate) fn parse(input: &str) -> IResult<&str, 
LinkDestination, CustomError<&str>> { + pub fn parse(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { if let Ok((rest, link_destination)) = parse_link(input) { Ok((rest, link_destination)) } else { @@ -77,7 +77,7 @@ impl LinkDestination<'_> { } // This is for parsing markdown labelled links. - pub(crate) fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { + pub fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { match Self::parse(input) { Ok((mut remaining, mut link)) => { if let Some(first) = remaining.chars().next() { From bf909a197ba3f11ecac16f52e0c38cf7c64513cf Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 18:26:18 +0330 Subject: [PATCH 58/74] correct formatting --- benches/my_benchmark.rs | 5 +---- src/parser/link_url/mod.rs | 2 +- tests/text_to_ast/markdown.rs | 7 ++----- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/benches/my_benchmark.rs b/benches/my_benchmark.rs index 8410354..13bedf5 100644 --- a/benches/my_benchmark.rs +++ b/benches/my_benchmark.rs @@ -1,9 +1,6 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; use deltachat_message_parser::parser::{ - LinkDestination, - parse_desktop_set, - parse_markdown_text, - parse_only_text + parse_desktop_set, parse_markdown_text, parse_only_text, LinkDestination, }; pub fn criterion_benchmark(c: &mut Criterion) { diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index bae4124..a017d99 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -75,7 +75,7 @@ impl LinkDestination<'_> { Err(nom::Err::Error(CustomError::InvalidLink)) } } - + // This is for parsing markdown labelled links. 
pub fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { match Self::parse(input) { diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index b635484..5e1f1db 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -559,10 +559,7 @@ fn link() { ), ( "gopher://[::1]/", - gopher_link_no_puny( - "gopher://[::1]/", - "[::1]", - ), + gopher_link_no_puny("gopher://[::1]/", "[::1]"), ), ( "https://[2345:0425:2CA1:0000:0000:0567:5673:23b5]/hello_world", @@ -577,7 +574,7 @@ fn link() { "https://[2345:425:2CA1:0:0:0567:5673:23b5]/hello_world", "[2345:425:2CA1:0:0:0567:5673:23b5]", ), - ) + ), ]; let test_cases_with_puny = [( From 060f57c9692b0ae919caf384bba59a56229f831f Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 18:29:34 +0330 Subject: [PATCH 59/74] a fix for rustdoc errors --- src/parser/link_url/parse_link.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index c7f413e..ae53334 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -121,7 +121,7 @@ fn take_while_ireg(input: &str) -> IResult<&str, &str, CustomError<&str>> { /// Parse the iauthority block /// # Description /// An iauthority is... 
-/// [iuserinfo] [:port] +/// `[iuserinfo] [:port]` /// # Return value /// unconsumed string AND `(iauthority, host, is_ipliteral)` where `ipliteral` is a boolean fn iauthority(input: &str) -> IResult<&str, (&str, &str, bool), CustomError<&str>> /* (iauthority, host, bool) */ From 513876da934046630986985eb7c7c9535ce81d22 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 18:33:18 +0330 Subject: [PATCH 60/74] fix clippy issues --- src/lib.rs | 2 +- src/parser/link_url/mod.rs | 23 +++++++++++---------- src/parser/link_url/parse_link.rs | 2 +- src/parser/parse_from_text/base_parsers.rs | 2 +- src/parser/parse_from_text/text_elements.rs | 1 - src/parser/utils.rs | 4 ++-- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 20ec82a..906aecf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,7 +9,7 @@ clippy::get_last_with_len, clippy::get_unwrap, clippy::get_unwrap, - clippy::integer_arithmetic, + clippy::arithmetic_side_effects, clippy::match_on_vec_items, clippy::match_wild_err_arm, clippy::missing_panics_doc, diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index a017d99..04de554 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -11,17 +11,18 @@ use crate::parser::{ parse_from_text::base_parsers::CustomError, }; -///! 
Parsing / Validation of URLs -/// -/// - hyperlinks (:// scheme) according to RFC3987 and RFC3988 -/// - whitelisted scheme (: scheme) according to our own simple thing :) -/// -/// for hyperlinks it also checks whether the domain contains punycode - -// There are two kinds of Urls -// - Common Internet Scheme[1] -// - Every other url (like mailto) -// [1] RFC1738(Section 3.1), RFC3987, RFC3988 --Farooq +/* Parsing / Validation of URLs + * + * - hyperlinks (:// scheme) according to RFC3987 and RFC3988 + * - whitelisted scheme (: scheme) according to our own simple thing :) + * + * for hyperlinks it also checks whether the domain contains punycode + * + * There are two kinds of Urls + * - Common Internet Scheme[1] + * - Every other url (like mailto) + * [1] RFC1738(Section 3.1), RFC3987, RFC3988 --Farooq + */ #[derive(Debug, PartialEq, Eq, Serialize, Clone)] pub struct LinkDestination<'a> { diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index ae53334..7735597 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -273,7 +273,7 @@ pub fn ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { } // IRI links per RFC3987 and RFC3986 -#[allow(clippy::integer_arithmetic)] +#[allow(clippy::arithmetic_side_effects)] fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let input_ = <&str>::clone(&input); // a link is :// [ipath] [iquery] [ifragment] diff --git a/src/parser/parse_from_text/base_parsers.rs b/src/parser/parse_from_text/base_parsers.rs index 7827a47..e5cb491 100644 --- a/src/parser/parse_from_text/base_parsers.rs +++ b/src/parser/parse_from_text/base_parsers.rs @@ -1,6 +1,6 @@ use std::fmt::Debug; -///! 
Base utility parsers, used by both text and markdown parsers +// Base utility parsers, used by both text and markdown parsers use nom::{ bytes::complete::tag, error::{ErrorKind, ParseError}, diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index aff6d92..aeb222f 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -1,4 +1,3 @@ -///! nom parsers for text elements use nom::{ bytes::{ complete::{tag, take, take_while, take_while1}, diff --git a/src/parser/utils.rs b/src/parser/utils.rs index 873e380..aacbe92 100644 --- a/src/parser/utils.rs +++ b/src/parser/utils.rs @@ -25,12 +25,12 @@ fn find_range_for_char(code: u32, ranges: &'_ [RangeInclusive]) -> FindRang match index { Ok(_) => FindRangeResult::WasOnRangeStart, Err(index) => match index { - #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)] + #[allow(clippy::arithmetic_side_effects, clippy::indexing_slicing)] 0 => FindRangeResult::Range(&ranges[0]), // Since `index` can never be 0, `index - 1` will never overflow. Furthermore, the // maximum value which the binary search function returns is `NUMBER_OF_RANGES`. // Therefore, `index - 1` will never panic if we index the array with it. 
- #[allow(clippy::integer_arithmetic, clippy::indexing_slicing)] + #[allow(clippy::arithmetic_side_effects, clippy::indexing_slicing)] index => FindRangeResult::Range(&ranges[index - 1]), }, } From b7fef4ec328d548f765262f24bbfce52f36d1281 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 19:20:28 +0330 Subject: [PATCH 61/74] improve tests --- src/parser/link_url/parse_link.rs | 160 ------------------------------ tests/links.rs | 154 ++++++++++++++++++++++++++++ tests/test.rs | 1 + tests/text_to_ast/desktop_set.rs | 4 +- tests/text_to_ast/markdown.rs | 2 +- 5 files changed, 158 insertions(+), 163 deletions(-) create mode 100644 tests/links.rs diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index 7735597..2f1a7f9 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -447,163 +447,3 @@ fn parse_generic(input: &str) -> IResult<&str, LinkDestination, CustomError<&str pub(super) fn parse_link(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { alt((parse_generic, parse_iri))(input) } - -#[cfg(test)] -mod test { - #![allow(clippy::unwrap_used)] - use crate::parser::{ - link_url::parse_link::{punycode_encode, PunycodeWarning}, - LinkDestination, - }; - - #[test] - fn basic_parsing() { - let test_cases_no_puny = vec![ - "http://delta.chat", - "http://delta.chat:8080", - "http://localhost", - "http://127.0.0.0", - "https://[::1]/", - "https://[::1]:9000?hi#o", - "https://delta.chat", - "ftp://delta.chat", - "https://delta.chat/en/help", - "https://delta.chat/en/help?hi=5&e=4", - "https://delta.chat?hi=5&e=4", - "https://delta.chat/en/help?hi=5&e=4#section2.0", - "https://delta#section2.0", - "http://delta.chat:8080?hi=5&e=4#section2.0", - "http://delta.chat:8080#section2.0", - "mailto:delta@example.com", - "mailto:delta@example.com?subject=hi&body=hello%20world", - "mailto:foö@ü.chat", - "ftp://test-test", - ]; - - let test_cases_with_puny = 
vec!["https://ü.app#help", "http://münchen.de"]; - - for input in &test_cases_no_puny { - let (rest, link_destination) = - LinkDestination::parse(input).expect(&format!("Test failed: {input}")); - - assert_eq!(input, &link_destination.target); - assert_eq!(rest.len(), 0); - assert!(link_destination.punycode.is_none()); - } - - for input in &test_cases_with_puny { - let (rest, link_destination) = - LinkDestination::parse(input).expect("Test failed: {input}"); - - assert!(link_destination.punycode.is_some()); - assert_eq!(rest.len(), 0); - assert_eq!(input, &link_destination.target); - } - } - - #[test] - fn invalid_domains() { - let test_cases = vec![";?:/hi", "##://thing"]; - - for input in &test_cases { - println!("testing {input}"); - assert!(LinkDestination::parse(input).is_err()); - } - } - #[test] - fn punycode_encode_fn() { - assert_eq!(punycode_encode("münchen.de"), "xn--mnchen-3ya.de") - } - - #[test] - fn punycode_detection() { - assert_eq!( - LinkDestination::parse("http://münchen.de").unwrap().1, - LinkDestination { - hostname: Some("münchen.de"), - target: "http://münchen.de", - scheme: "http", - punycode: Some(PunycodeWarning { - original_hostname: "münchen.de".to_owned(), - ascii_hostname: "xn--mnchen-3ya.de".to_owned(), - punycode_encoded_url: "http://xn--mnchen-3ya.de".to_owned(), - }), - } - ); - - assert_eq!( - LinkDestination::parse("http://muenchen.de").unwrap().1, - LinkDestination { - hostname: Some("muenchen.de"), - target: "http://muenchen.de", - scheme: "http", - punycode: None, - } - ); - } - - #[test] - fn common_schemes() { - assert_eq!( - LinkDestination::parse("http://delta.chat").unwrap(), - ( - "", - LinkDestination { - hostname: Some("delta.chat"), - target: "http://delta.chat", - scheme: "http", - punycode: None, - } - ) - ); - assert_eq!( - LinkDestination::parse("https://far.chickenkiller.com").unwrap(), - ( - "", - LinkDestination { - hostname: Some("far.chickenkiller.com"), - target: "https://far.chickenkiller.com", - scheme: 
"https", - punycode: None, - } - ) - ); - } - #[test] - fn generic_schemes() { - assert_eq!( - LinkDestination::parse("mailto:someone@example.com").unwrap(), - ( - "", - LinkDestination { - hostname: None, - scheme: "mailto", - punycode: None, - target: "mailto:someone@example.com" - } - ) - ); - assert_eq!( - LinkDestination::parse("bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka") - .unwrap() - .1, - LinkDestination { - hostname: None, - scheme: "bitcoin", - target: "bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka", - punycode: None, - } - ); - assert_eq!( - LinkDestination::parse("geo:37.786971,-122.399677") - .unwrap() - .1, - LinkDestination { - scheme: "geo", - punycode: None, - target: "geo:37.786971,-122.399677", - hostname: None - } - ); - } -} diff --git a/tests/links.rs b/tests/links.rs new file mode 100644 index 0000000..4f522d7 --- /dev/null +++ b/tests/links.rs @@ -0,0 +1,154 @@ +#![allow(clippy::unwrap_used)] +use deltachat_message_parser::parser::{ + link_url::PunycodeWarning, + LinkDestination, +}; + +#[test] +fn basic_parsing() { + let test_cases_no_puny = vec![ + "http://delta.chat", + "http://delta.chat:8080", + "http://localhost", + "http://127.0.0.0", + "https://[::1]/", + "https://[::1]:9000?hi#o", + "https://delta.chat", + "ftp://delta.chat", + "https://delta.chat/en/help", + "https://delta.chat/en/help?hi=5&e=4", + "https://delta.chat?hi=5&e=4", + "https://delta.chat/en/help?hi=5&e=4#section2.0", + "https://delta#section2.0", + "http://delta.chat:8080?hi=5&e=4#section2.0", + "http://delta.chat:8080#section2.0", + "mailto:delta@example.com", + "mailto:delta@example.com?subject=hi&body=hello%20world", + "mailto:foö@ü.chat", + "ftp://test-test", + ]; + + let test_cases_with_puny = vec!["https://ü.app#help", "http://münchen.de"]; + + for input in &test_cases_no_puny { + let (rest, link_destination) = + LinkDestination::parse(input).expect(&format!("Test failed: {input}")); + + assert_eq!(input, &link_destination.target); + 
assert_eq!(rest.len(), 0); + assert!(link_destination.punycode.is_none()); + } + + for input in &test_cases_with_puny { + let Ok((rest, link_destination)) = + LinkDestination::parse(input) else { + panic!("Parsing {} as link failed", input); + }; + + assert!(link_destination.punycode.is_some()); + assert_eq!(rest.len(), 0); + assert_eq!(input, &link_destination.target); + } +} + +#[test] +fn invalid_domains() { + let test_cases = vec![";?:/hi", "##://thing"]; + + for input in &test_cases { + println!("testing {input}"); + assert!(LinkDestination::parse(input).is_err()); + } +} + +#[test] +fn punycode_detection() { + assert_eq!( + LinkDestination::parse("http://münchen.de").unwrap().1, + LinkDestination { + hostname: Some("münchen.de"), + target: "http://münchen.de", + scheme: "http", + punycode: Some(PunycodeWarning { + original_hostname: "münchen.de".to_owned(), + ascii_hostname: "xn--mnchen-3ya.de".to_owned(), + punycode_encoded_url: "http://xn--mnchen-3ya.de".to_owned(), + }), + } + ); + + assert_eq!( + LinkDestination::parse("http://muenchen.de").unwrap().1, + LinkDestination { + hostname: Some("muenchen.de"), + target: "http://muenchen.de", + scheme: "http", + punycode: None, + } + ); +} + +#[test] +fn common_schemes() { + assert_eq!( + LinkDestination::parse("http://delta.chat").unwrap(), + ( + "", + LinkDestination { + hostname: Some("delta.chat"), + target: "http://delta.chat", + scheme: "http", + punycode: None, + } + ) + ); + assert_eq!( + LinkDestination::parse("https://far.chickenkiller.com").unwrap(), + ( + "", + LinkDestination { + hostname: Some("far.chickenkiller.com"), + target: "https://far.chickenkiller.com", + scheme: "https", + punycode: None, + } + ) + ); +} +#[test] +fn generic_schemes() { + assert_eq!( + LinkDestination::parse("mailto:someone@example.com").unwrap(), + ( + "", + LinkDestination { + hostname: None, + scheme: "mailto", + punycode: None, + target: "mailto:someone@example.com" + } + ) + ); + assert_eq!( + 
LinkDestination::parse("bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka") + .unwrap() + .1, + LinkDestination { + hostname: None, + scheme: "bitcoin", + target: "bitcoin:bc1qt3xhfvwmdqvxkk089tllvvtzqs8ts06u3u6qka", + punycode: None, + } + ); + assert_eq!( + LinkDestination::parse("geo:37.786971,-122.399677") + .unwrap() + .1, + LinkDestination { + scheme: "geo", + punycode: None, + target: "geo:37.786971,-122.399677", + hostname: None + } + ); +} diff --git a/tests/test.rs b/tests/test.rs index f37c391..f1a1878 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1 +1,2 @@ mod text_to_ast; +mod links; diff --git a/tests/text_to_ast/desktop_set.rs b/tests/text_to_ast/desktop_set.rs index 0d647c0..eb1ff84 100644 --- a/tests/text_to_ast/desktop_set.rs +++ b/tests/text_to_ast/desktop_set.rs @@ -295,8 +295,8 @@ fn link() { Link { destination } => { assert_eq!(expected_destination.target, destination.target); assert_eq!(expected_destination.scheme, destination.scheme); - assert_eq!(expected_destination.hostname, destination.hostname,); - assert_eq!(destination.punycode.is_some(), true); + assert_eq!(expected_destination.hostname, destination.hostname); + assert!(destination.punycode.is_some()); } _ => { panic!(); diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index 5e1f1db..cd04167 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -601,7 +601,7 @@ fn link() { assert_eq!(expected_destination.target, destination.target); assert_eq!(expected_destination.scheme, destination.scheme); assert_eq!(expected_destination.hostname, destination.hostname,); - assert_eq!(destination.punycode.is_some(), true); + assert!(destination.punycode.is_some()); } _ => { panic!(); From 96383815a663835b271026716b7cbc229bdfec76 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 19:27:06 +0330 Subject: [PATCH 62/74] Fix another clippy thing --- tests/links.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/tests/links.rs b/tests/links.rs index 4f522d7..a0ead8d 100644 --- a/tests/links.rs +++ b/tests/links.rs @@ -32,7 +32,7 @@ fn basic_parsing() { for input in &test_cases_no_puny { let (rest, link_destination) = - LinkDestination::parse(input).expect(&format!("Test failed: {input}")); + LinkDestination::parse(input).unwrap_or_else(|_| panic!("Cannot parse link: {}", input)); assert_eq!(input, &link_destination.target); assert_eq!(rest.len(), 0); From 305f6f9eb47227e4244b1aa1723d7d4a57136733 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 19:27:27 +0330 Subject: [PATCH 63/74] Fix clippy issues --- tests/text_to_ast/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/text_to_ast/mod.rs b/tests/text_to_ast/mod.rs index c72da49..b0f76a5 100644 --- a/tests/text_to_ast/mod.rs +++ b/tests/text_to_ast/mod.rs @@ -10,7 +10,7 @@ fn gopher_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestinatio } } -fn internal_link<'a>(target: &'a str) -> LinkDestination<'a> { +fn internal_link(target: &str) -> LinkDestination<'_> { LinkDestination { target, hostname: None, @@ -46,7 +46,7 @@ fn https_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination } } -fn mailto_link_no_puny<'a>(target: &'a str) -> LinkDestination<'a> { +fn mailto_link_no_puny(target: &str) -> LinkDestination<'_> { LinkDestination { target, hostname: None, From 829ba3d5ec6e2b873fd85ee40f0327d7de791c6e Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 19:28:43 +0330 Subject: [PATCH 64/74] fix formatting --- tests/links.rs | 16 ++++++---------- tests/test.rs | 2 +- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/tests/links.rs b/tests/links.rs index a0ead8d..a743ae5 100644 --- a/tests/links.rs +++ b/tests/links.rs @@ -1,8 +1,5 @@ #![allow(clippy::unwrap_used)] -use deltachat_message_parser::parser::{ - link_url::PunycodeWarning, - LinkDestination, -}; +use 
deltachat_message_parser::parser::{link_url::PunycodeWarning, LinkDestination}; #[test] fn basic_parsing() { @@ -31,8 +28,8 @@ fn basic_parsing() { let test_cases_with_puny = vec!["https://ü.app#help", "http://münchen.de"]; for input in &test_cases_no_puny { - let (rest, link_destination) = - LinkDestination::parse(input).unwrap_or_else(|_| panic!("Cannot parse link: {}", input)); + let (rest, link_destination) = LinkDestination::parse(input) + .unwrap_or_else(|_| panic!("Cannot parse link: {}", input)); assert_eq!(input, &link_destination.target); assert_eq!(rest.len(), 0); @@ -40,10 +37,9 @@ fn basic_parsing() { } for input in &test_cases_with_puny { - let Ok((rest, link_destination)) = - LinkDestination::parse(input) else { - panic!("Parsing {} as link failed", input); - }; + let Ok((rest, link_destination)) = LinkDestination::parse(input) else { + panic!("Parsing {} as link failed", input); + }; assert!(link_destination.punycode.is_some()); assert_eq!(rest.len(), 0); diff --git a/tests/test.rs b/tests/test.rs index f1a1878..2aff94a 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1,2 +1,2 @@ -mod text_to_ast; mod links; +mod text_to_ast; From a0203f4363e504cbe5d32a846a9c8770d6442cf7 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Tue, 30 Apr 2024 19:31:27 +0330 Subject: [PATCH 65/74] upgrade rust toolchain to 1.77.2 --- .github/workflows/ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2856326..800d3c0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: 1.64.0 + toolchain: 1.77.2 override: true - run: rustup component add rustfmt - uses: actions-rs/cargo@v1 @@ -31,7 +31,7 @@ jobs: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 with: - toolchain: 1.64.0 + toolchain: 1.77.2 components: clippy override: true - uses: actions-rs/clippy-check@v1 @@ 
-68,9 +68,9 @@ jobs: matrix: include: - os: ubuntu-latest - rust: 1.64.0 + rust: 1.77.2 - os: windows-latest - rust: 1.64.0 + rust: 1.77.2 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@master From abe9bec051f0da830888cd7633d067dcc1847470 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 5 May 2024 12:15:51 +0330 Subject: [PATCH 66/74] don't parse internal markdown links. see #66 in the repo for details --- src/parser/link_url/mod.rs | 35 +++++++++---------------------- src/parser/link_url/parse_link.rs | 2 +- tests/text_to_ast/markdown.rs | 7 ------- tests/text_to_ast/mod.rs | 9 -------- 4 files changed, 11 insertions(+), 42 deletions(-) diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index 04de554..9e7245b 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -7,7 +7,7 @@ use nom::{ }; use crate::parser::{ - link_url::parse_link::{ifragment, parse_link}, + link_url::parse_link::parse_link, parse_from_text::base_parsers::CustomError, }; @@ -79,32 +79,17 @@ impl LinkDestination<'_> { // This is for parsing markdown labelled links. pub fn parse_labelled(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { - match Self::parse(input) { - Ok((mut remaining, mut link)) => { - if let Some(first) = remaining.chars().next() { - if matches!(first, ';' | '.' | ',' | ':') { - // ^ markdown labelled links can include one of these characters at the end - // and it's therefore part of the link - let point = link.target.len().saturating_add(1); - link.target = input.slice(..point); - remaining = input.slice(point..); - } - } - Ok((remaining, link)) - } - Err(..) => { - let (remaining, target) = ifragment(input)?; - Ok(( - remaining, - LinkDestination { - target, - scheme: "", - hostname: None, - punycode: None, - }, - )) + let (mut remaining, mut link) = Self::parse(input)?; + if let Some(first) = remaining.chars().next() { + if matches!(first, ';' | '.' 
| ',' | ':') { + // ^ markdown labelled links can include one of these characters at the end + // and it's therefore part of the link + let point = link.target.len().saturating_add(1); + link.target = input.slice(..point); + remaining = input.slice(point..); } } + Ok((remaining, link)) } } diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index 7735597..18464db 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -268,7 +268,7 @@ pub fn get_puny_code_warning(link: &str, host: &str) -> Option } } -pub fn ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { +fn ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple((char('#'), take_while_ifragment)))(input) } diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index 5e1f1db..24a88a1 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -688,13 +688,6 @@ fn labeled_link() { ), }] ); - assert_eq!( - parse_markdown_text("[internal link](#internal)"), - vec![LabeledLink { - label: vec![Text("internal link")], - destination: internal_link("#internal") - }] - ); } #[test] diff --git a/tests/text_to_ast/mod.rs b/tests/text_to_ast/mod.rs index c72da49..53379bf 100644 --- a/tests/text_to_ast/mod.rs +++ b/tests/text_to_ast/mod.rs @@ -10,15 +10,6 @@ fn gopher_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestinatio } } -fn internal_link<'a>(target: &'a str) -> LinkDestination<'a> { - LinkDestination { - target, - hostname: None, - scheme: "", - punycode: None, - } -} - fn http_link_no_puny<'a>(target: &'a str, hostname: &'a str) -> LinkDestination<'a> { LinkDestination { target, From 46fe1157a77d57c68070885f1af15d0885bbe4ac Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 5 May 2024 12:39:14 +0330 Subject: [PATCH 67/74] move parenthesis, bracket and angle parsing into dedicated function --- src/parser/link_url/mod.rs | 5 +- 
src/parser/link_url/parse_link.rs | 134 ++++++++++++++---------------- 2 files changed, 63 insertions(+), 76 deletions(-) diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index 9e7245b..3900c1b 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -6,10 +6,7 @@ use nom::{ IResult, Slice, }; -use crate::parser::{ - link_url::parse_link::parse_link, - parse_from_text::base_parsers::CustomError, -}; +use crate::parser::{link_url::parse_link::parse_link, parse_from_text::base_parsers::CustomError}; /* Parsing / Validation of URLs * diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index 8520b1a..c060df4 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -272,6 +272,67 @@ fn ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple((char('#'), take_while_ifragment)))(input) } +macro_rules! link_correct { + ($a: expr, $b: expr, $c: expr, $d: expr) => { + // for opening ones + { + $a = $a.saturating_add(1); + if $d.slice($c..).find($b).is_none() { + return Some($c); + } + } + }; + ($a: expr, $b: expr) => { + // for closing ones + { + if $a == 0 { + return Some($b); + } else { + $a = $a.saturating_sub(1); + } + } + }; +} + +// TODO: better name for this function +fn get_correct_link(link: &str) -> Option { + let mut parenthes = 0usize; // () + let mut curly_bracket = 0usize; // {} + let mut bracket = 0usize; // [] + let mut angle = 0usize; // <> + + for (i, ch) in link.chars().enumerate() { + match ch { + '(' => { + link_correct!(parenthes, ')', i, link); + } + '{' => { + link_correct!(curly_bracket, '}', i, link); + } + '[' => { + link_correct!(bracket, ']', i, link); + } + '<' => { + link_correct!(angle, '>', i, link); + } + ')' => { + link_correct!(parenthes, i); + } + ']' => { + link_correct!(bracket, i); + } + '}' => { + link_correct!(curly_bracket, i); + } + '>' => { + link_correct!(angle, i); + } + _ => continue, + } + } + None 
+} + // IRI links per RFC3987 and RFC3986 #[allow(clippy::arithmetic_side_effects)] fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { @@ -322,78 +383,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { host = input_.slice(scheme.len() + 3..input_.len() - 1); } } - - let mut parenthes = 0usize; // () - let mut curly_bracket = 0usize; // {} - let mut bracket = 0usize; // [] - let mut angle = 0usize; // <> - - for (i, ch) in link.chars().enumerate() { - match ch { - '(' => { - parenthes = parenthes.saturating_add(1); - if link.slice(i..).find(')').is_none() { - len = i; - break; - } - } - '{' => { - curly_bracket = curly_bracket.saturating_add(1); - if link.slice(i..).find('}').is_none() { - len = i; - break; - } - } - '[' => { - bracket = bracket.saturating_add(1); - if link.slice(i..).find(']').is_none() { - len = i; - break; - } - } - '<' => { - angle = angle.saturating_add(1); - if link.slice(i..).find('>').is_none() { - len = i; - break; - } - } - ')' => { - if parenthes == 0 { - len = i; - break; - } else { - parenthes -= 1; - } - } - ']' => { - if bracket == 0 { - len = i; - break; - } else { - bracket -= 1; - } - } - '}' => { - if curly_bracket == 0 { - len = i; - break; - } else { - curly_bracket -= 1; - } - } - '>' => { - if angle == 0 { - len = i; - break; - } else { - angle -= 1; - } - } - _ => continue, - } - } - + len = get_correct_link(link).unwrap_or(len); let link = input_.slice(0..len); let input = input_.slice(len..); From 7c6ff77e57453cdb337c48adb13c3ccf98a228b0 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 5 May 2024 12:42:00 +0330 Subject: [PATCH 68/74] remove unused clippy ignore --- src/parser/link_url/parse_link.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index c060df4..2173952 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -334,7 +334,6 @@ fn 
get_correct_link(link: &str) -> Option { } // IRI links per RFC3987 and RFC3986 -#[allow(clippy::arithmetic_side_effects)] fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { let input_ = <&str>::clone(&input); // a link is :// [ipath] [iquery] [ifragment] From ef139802b4a6ddced4f2caab563263888e66a442 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 5 May 2024 12:57:05 +0330 Subject: [PATCH 69/74] fix clippy issues --- src/parser/link_url/parse_link.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index 2173952..9ff1d93 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -377,9 +377,9 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { // compute length of link which is ihier_len + scheme + query + fragment if let Some(link) = input_.get(0..len) { if link.ends_with([':', ';', '.', ',']) { - len -= 1; + len = len.saturating_sub(1); if path.is_empty() && query.is_empty() && fragment.is_empty() { - host = input_.slice(scheme.len() + 3..input_.len() - 1); + host = input_.slice(scheme.len().saturating_add(3)..input_.len().saturating_sub(1)); } } len = get_correct_link(link).unwrap_or(len); From 3c7726e3173ce6aacf60963097ceb97b311446bc Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Tue, 7 May 2024 12:34:28 +0200 Subject: [PATCH 70/74] fix delimited link --- src/parser/parse_from_text/markdown_elements.rs | 9 +-------- tests/text_to_ast/markdown.rs | 3 +-- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index bbbf103..db9c205 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -94,14 +94,7 @@ pub(crate) fn delimited_email_address(input: &str) -> IResult<&str, Element, Cus // pub(crate) fn 
delimited_link(input: &str) -> IResult<&str, Element, CustomError<&str>> { - let (input, content): (&str, &str) = delimited(tag("<"), is_not(">"), tag(">"))(input)?; - if content.is_empty() { - return Err(nom::Err::Error(CustomError::NoContent)); - } - let (rest, destination) = LinkDestination::parse(input)?; - if !rest.is_empty() { - return Err(nom::Err::Error(CustomError::UnexpectedContent)); - } + let (input, (_, destination, _)): (&str, (&str, LinkDestination, &str)) = tuple((tag("<"), LinkDestination::parse_labelled , tag(">")))(input)?; Ok((input, Element::Link { destination })) } diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index 1410cff..c0e8e84 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -650,14 +650,13 @@ fn test_delimited_link_example() { "This is an my site: \nVisit me there" ), vec![ - Text("This is an my site: <"), + Text("This is an my site: "), Link { destination: https_link_no_puny( "https://delta.chat/en/help?hi=5&e=4#section2.0", "delta.chat" ) }, - Text(">"), Linebreak, Text("Visit me there") ] From 957f08f86e139065a90c7333bbcb5edaa5727330 Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Tue, 7 May 2024 12:36:06 +0200 Subject: [PATCH 71/74] cargo fmt --- src/parser/parse_from_text/markdown_elements.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index db9c205..dbc5ec8 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -2,7 +2,7 @@ use nom::{ bytes::complete::{is_not, tag, take, take_while}, character::complete::alphanumeric1, combinator::{opt, peek, recognize}, - sequence::delimited, + sequence::{delimited, tuple}, IResult, }; @@ -94,7 +94,8 @@ pub(crate) fn delimited_email_address(input: &str) -> IResult<&str, Element, Cus // pub(crate) fn delimited_link(input: &str) -> IResult<&str, 
Element, CustomError<&str>> { - let (input, (_, destination, _)): (&str, (&str, LinkDestination, &str)) = tuple((tag("<"), LinkDestination::parse_labelled , tag(">")))(input)?; + let (input, (_, destination, _)): (&str, (&str, LinkDestination, &str)) = + tuple((tag("<"), LinkDestination::parse_labelled, tag(">")))(input)?; Ok((input, Element::Link { destination })) } From 63628d861b3063563467b99c25d650185862fc7c Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Tue, 7 May 2024 12:45:32 +0200 Subject: [PATCH 72/74] fix parenthesis in target of labeled link --- message_parser_wasm/src/lib.rs | 2 +- src/parser/parse_from_text/markdown_elements.rs | 15 ++++----------- tests/text_to_ast/markdown.rs | 11 +++++++++++ 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/message_parser_wasm/src/lib.rs b/message_parser_wasm/src/lib.rs index cb6d75f..e6882ee 100644 --- a/message_parser_wasm/src/lib.rs +++ b/message_parser_wasm/src/lib.rs @@ -23,7 +23,7 @@ pub fn parse_text(s: &str, enable_markdown: bool) -> JsValue { serde_wasm_bindgen::to_value(&ast).expect("Element converts to JsValue") } -/// parses text to json AST (text elements and labled links, to replicate current desktop implementation) +/// parses text to json AST (text elements and labeled links, to replicate current desktop implementation) #[wasm_bindgen] pub fn parse_desktop_set(s: &str) -> JsValue { serde_wasm_bindgen::to_value(&deltachat_message_parser::parser::parse_desktop_set(s)) diff --git a/src/parser/parse_from_text/markdown_elements.rs b/src/parser/parse_from_text/markdown_elements.rs index dbc5ec8..a239839 100644 --- a/src/parser/parse_from_text/markdown_elements.rs +++ b/src/parser/parse_from_text/markdown_elements.rs @@ -107,17 +107,10 @@ pub(crate) fn labeled_link(input: &str) -> IResult<&str, Element, CustomError<&s } let label = parse_all(raw_label); - let (input, raw_link): (&str, &str) = delimited(tag("("), is_not(")"), tag(")"))(input)?; - if raw_link.is_empty() { - return 
Err(nom::Err::Error(CustomError::NoContent)); - } - // check if result is valid link - let (remainder, destination) = LinkDestination::parse_labelled(raw_link)?; - if remainder.is_empty() { - Ok((input, Element::LabeledLink { label, destination })) - } else { - Err(nom::Err::Error(CustomError::InvalidLink)) - } + let (input, (_, destination, _)) = + tuple((tag("("), LinkDestination::parse_labelled, tag(")")))(input)?; + + Ok((input, Element::LabeledLink { label, destination })) } pub(crate) fn parse_element( diff --git a/tests/text_to_ast/markdown.rs b/tests/text_to_ast/markdown.rs index c0e8e84..fe19090 100644 --- a/tests/text_to_ast/markdown.rs +++ b/tests/text_to_ast/markdown.rs @@ -689,6 +689,17 @@ fn labeled_link() { ); } +#[test] +fn labeled_link_parenthesis_in_target() { + assert_eq!( + parse_markdown_text("[a link](https://delta.chat/en/help(help)hi)"), + vec![LabeledLink { + label: vec![Text("a link")], + destination: https_link_no_puny("https://delta.chat/en/help(help)hi", "delta.chat"), + }] + ); +} + #[test] fn labeled_link_example() { assert_eq!( From 3f82aad21dc7367f416c99d5674d80ed7f2290e1 Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Tue, 7 May 2024 12:50:29 +0200 Subject: [PATCH 73/74] remove dead commented out code and update doc comment --- src/parser/link_url/mod.rs | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index 3900c1b..9473ba2 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -42,30 +42,9 @@ pub struct PunycodeWarning { impl LinkDestination<'_> { /// parse a link that is not in a delimited link or a labled link, just a part of normal text - /// it has a whitelist of schemes, because otherwise - /* - pub(crate) fn parse_standalone_with_whitelist( - input: &str, - ) -> IResult<&str, LinkDestination, CustomError<&str>> { - if let Ok((rest, link_destination)) = parse_link(input) { - if 
link_destination.hostname.is_none() { - // if it's a generic url like geo:-15.5,41.1 - if !is_allowed_generic_scheme(link_destination.scheme) { - Err(nom::Err::Error(CustomError::InvalidLink)) - } else { - Ok((rest, link_destination)) - } - } else { - Ok(( - rest, - link_destination - )) - } - } else { - Err(nom::Err::Error(CustomError::InvalidLink)) - } - } - */ + /// + /// - for generic schemes (schemes without `://`) this uses a whitelist to reduce false positives + /// - it also ignores the last punctuation sign if it is at the end of the link pub fn parse(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { if let Ok((rest, link_destination)) = parse_link(input) { Ok((rest, link_destination)) From f58dac6735fbb4ec7fd089826a4ebf85eb221724 Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Tue, 7 May 2024 12:54:21 +0200 Subject: [PATCH 74/74] add links to link parsing standards to spec --- spec.md | 1 + 1 file changed, 1 insertion(+) diff --git a/spec.md b/spec.md index 8975c73..3d07967 100644 --- a/spec.md +++ b/spec.md @@ -41,6 +41,7 @@ Make email addresses clickable, opens the chat with that contact and creates it Make URLs clickable. - detect all valid hyperlink URLs that have the `://` (protocol://host). + - according to [RFC3987](https://www.rfc-editor.org/rfc/rfc3987) and [RFC3986](https://www.rfc-editor.org/rfc/rfc3986) - other links like `mailto:` (note there is just a single `:`, no `://`) will get separate parsing that includes a whitelisted protocol name, otherwise there will likely be unexpected behavior if user types `hello:world` - will be recognized as link.