From d7844c6738070ff199e4a3b5248166a45daeeabb Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Thu, 9 Jan 2025 21:31:39 +0100 Subject: [PATCH] some smaller code cleanup (#86) * remove comment with dead code * move parenthesis counter into dedicated file and change to more descriptive name also add some tests --- src/parser/link_url/mod.rs | 1 + src/parser/link_url/parenthesis_counter.rs | 81 ++++++++++++ src/parser/link_url/parse_link.rs | 65 +--------- src/parser/parse_from_text/text_elements.rs | 137 -------------------- 4 files changed, 85 insertions(+), 199 deletions(-) create mode 100644 src/parser/link_url/parenthesis_counter.rs diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index f82a5ec..e3ccb01 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -1,4 +1,5 @@ mod ip; +mod parenthesis_counter; mod parse_link; use nom::{ diff --git a/src/parser/link_url/parenthesis_counter.rs b/src/parser/link_url/parenthesis_counter.rs new file mode 100644 index 0000000..dd0dbf6 --- /dev/null +++ b/src/parser/link_url/parenthesis_counter.rs @@ -0,0 +1,81 @@ +use nom::Slice; + +macro_rules! adjust_balance { + ($a: expr, $b: expr, $c: expr, $d: expr) => { + // for opening ones + { + $a = $a.saturating_add(1); + if $d.slice($c..).find($b).is_none() { + return Some($c); + } + } + }; + ($a: expr, $b: expr) => { + // for closing ones + { + if $a == 0 { + return Some($b); + } else { + $a = $a.saturating_sub(1); + } + } + }; +} + +/// finds unbalanced closing parenthesis and returns distance to it. 
+/// unbalanced means it was closed but not opened before in the given string +pub(super) fn count_chars_in_complete_parenthesis(input: &str) -> Option<usize> { + let mut parenthes = 0usize; // () + let mut curly_bracket = 0usize; // {} + let mut bracket = 0usize; // [] + let mut angle = 0usize; // <> + + for (i, ch) in input.chars().enumerate() { + match ch { + '(' => { + adjust_balance!(parenthes, ')', i, input); + } + '{' => { + adjust_balance!(curly_bracket, '}', i, input); + } + '[' => { + adjust_balance!(bracket, ']', i, input); + } + '<' => { + adjust_balance!(angle, '>', i, input); + } + ')' => { + adjust_balance!(parenthes, i); + } + ']' => { + adjust_balance!(bracket, i); + } + '}' => { + adjust_balance!(curly_bracket, i); + } + '>' => { + adjust_balance!(angle, i); + } + _ => continue, + } + } + None +} + +#[test] +fn test_count_parenthesis() { + assert_eq!(count_chars_in_complete_parenthesis("{}"), None); + assert_eq!(count_chars_in_complete_parenthesis("{} test"), None); + assert_eq!(count_chars_in_complete_parenthesis("(test) test"), None); + assert_eq!(count_chars_in_complete_parenthesis("(test)) test"), Some(6)); +} + +#[test] +fn test_count_different_types_invalid() { + assert_eq!(count_chars_in_complete_parenthesis("(({(})))"), None); +} + +#[test] +fn test_count_different_types_invalid2() { + assert_eq!(count_chars_in_complete_parenthesis("}(({(})))"), Some(0)); +} diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index dc3635a..3bc58aa 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -22,6 +22,8 @@ use crate::parser::{ }, }; +use super::parenthesis_counter::count_chars_in_complete_parenthesis; + /// determines which generic schemes (without '://') get linkifyed fn is_allowed_generic_scheme(scheme: &str) -> bool { matches!( @@ -272,67 +274,6 @@ fn ifragment(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(tuple((char('#'), take_while_ifragment)))(input) } 
-macro_rules! link_correct { - ($a: expr, $b: expr, $c: expr, $d: expr) => { - // for opening ones - { - $a = $a.saturating_add(1); - if $d.slice($c..).find($b).is_none() { - return Some($c); - } - } - }; - ($a: expr, $b: expr) => { - // for closing ones - { - if $a == 0 { - return Some($b); - } else { - $a = $a.saturating_sub(1); - } - } - }; -} - -// TODO: better name for this function -fn get_correct_link(link: &str) -> Option<usize> { - let mut parenthes = 0usize; // () - let mut curly_bracket = 0usize; // {} - let mut bracket = 0usize; // [] - let mut angle = 0usize; // <> - - for (i, ch) in link.chars().enumerate() { - match ch { - '(' => { - link_correct!(parenthes, ')', i, link); - } - '{' => { - link_correct!(curly_bracket, '}', i, link); - } - '[' => { - link_correct!(bracket, ']', i, link); - } - '<' => { - link_correct!(angle, '>', i, link); - } - ')' => { - link_correct!(parenthes, i); - } - ']' => { - link_correct!(bracket, i); - } - '}' => { - link_correct!(curly_bracket, i); - } - '>' => { - link_correct!(angle, i); - } - _ => continue, - } - } - None -} - fn parse_ipath_abempty(input: &str) -> IResult<&str, &str, CustomError<&str>> { recognize(many0(tuple((char('/'), opt(take_while_ipchar1)))))(input) } @@ -406,7 +347,7 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { host = input_.slice(scheme.len().saturating_add(3)..input_.len().saturating_sub(1)); } } - len = get_correct_link(link).unwrap_or(len); + len = count_chars_in_complete_parenthesis(link).unwrap_or(len); let link = input_.slice(0..len); let input = input_.slice(len..); diff --git a/src/parser/parse_from_text/text_elements.rs b/src/parser/parse_from_text/text_elements.rs index b154ca6..dc6c4da 100644 --- a/src/parser/parse_from_text/text_elements.rs +++ b/src/parser/parse_from_text/text_elements.rs @@ -95,143 +95,6 @@ pub(crate) fn fediverse_address_as_text(input: &str) -> IResult<&str, Element, C Ok((input, Element::Text(consumed))) } -/* -fn 
not_link_part_char(c: char) -> bool { - !matches!(c, ':' | '\n' | '\r' | '\t' | ' ') -} - -fn link(input: &str) -> IResult<&str, (), CustomError<&str>> { - let (input, _) = take_while1(link_scheme)(input)?; -} - -/// rough recognition of an link, results gets checked by a real link parser -fn link_intern(input: &str) -> IResult<&str, (), CustomError<&str>> { - let (input, _) = take_while1(not_link_part_char)(input)?; - let (input, _) = tag(":")(input)?; - let i = <&str>::clone(&input); - let (remaining, consumed) = take_while1(is_not_white_space)(i)?; - - let mut parentheses_count = 0usize; // () - let mut curly_brackets_count = 0usize; // {} - let mut brackets_count = 0usize; // [] - let mut angle_brackets = 0usize; // <> - - let mut alternative_offset = None; - for (i, char) in consumed.chars().enumerate() { - match char { - '(' => { - parentheses_count = parentheses_count.saturating_add(1); - // if there is no closing bracket in the link, then don't take the bracket as a part of the link - if (<&str>::clone(&consumed)).slice(i..).find(')').is_none() { - alternative_offset = Some(i); - break; - } - } - '{' => { - curly_brackets_count = curly_brackets_count.saturating_add(1); - // if there is no closing bracket in the link, then don't take the bracket as a part of the link - if (<&str>::clone(&consumed)).slice(i..).find('}').is_none() { - alternative_offset = Some(i); - break; - } - } - '[' => { - brackets_count = brackets_count.saturating_add(1); - // if there is no closing bracket in the link, then don't take the bracket as a part of the link - if (<&str>::clone(&consumed)).slice(i..).find(']').is_none() { - alternative_offset = Some(i); - break; - } - } - '<' => { - angle_brackets = angle_brackets.saturating_add(1); - // if there is no closing bracket in the link, then don't take the bracket as a part of the link - if (<&str>::clone(&consumed)).slice(i..).find('>').is_none() { - alternative_offset = Some(i); - break; - } - } - ')' => { - if parentheses_count == 
0 { - alternative_offset = Some(i); - break; - } else { - parentheses_count = parentheses_count.saturating_sub(1); - } - } - '}' => { - if curly_brackets_count == 0 { - alternative_offset = Some(i); - break; - } else { - curly_brackets_count = curly_brackets_count.saturating_sub(1); - } - } - ']' => { - if brackets_count == 0 { - alternative_offset = Some(i); - break; - } else { - brackets_count = brackets_count.saturating_sub(1); - } - } - '>' => { - if angle_brackets == 0 { - alternative_offset = Some(i); - break; - } else { - angle_brackets = angle_brackets.saturating_sub(1); - } - } - _ => continue, - } - } - - if let Some(offset) = alternative_offset { - let remaining = input.slice(offset..); - Ok((remaining, ())) - } else { - Ok((remaining, ())) - } -} - -pub(crate) fn link(input: &str) -> IResult<&str, Element, CustomError<&str>> { - // basically - //let (input, content) = recognize(link_intern)(input)?; - // but don't eat the last char if it is one of these: `.,;:` - let i = <&str>::clone(&input); - let i2 = <&str>::clone(&input); - let i3 = <&str>::clone(&input); - let (input, content) = match link_intern(i) { - Ok((remaining, _)) => { - let index = i2.offset(remaining); - let consumed = i2.slice(..index); - match consumed.chars().last() { - Some(c) => match c { - '.' | ',' | ':' | ';' => { - let index = input.offset(remaining).saturating_sub(1); - let consumed = i3.slice(..index); - let remaining = input.slice(index..); - Ok((remaining, consumed)) - } - _ => Ok((remaining, consumed)), - }, - _ => Ok((remaining, consumed)), - } - } - Err(e) => Err(e), - }?; - - // check if result is valid link - let (remainder, destination) = LinkDestination::parse_standalone_with_whitelist(content)?; - - if remainder.is_empty() { - Ok((input, Element::Link { destination })) - } else { - Err(nom::Err::Error(CustomError::InvalidLink)) - } -} -*/ fn is_allowed_bot_cmd_suggestion_char(char: char) -> bool { match char { '@' | '\\' | '_' | '.' | '-' | '/' => true,