From bcb08956c51315a52bfb01b5c6472cd77c645748 Mon Sep 17 00:00:00 2001 From: Farooq Karimi Zadeh Date: Sun, 9 Jun 2024 18:30:21 +0330 Subject: [PATCH] add utilities to determine if a char is unicode whitespace or punctuation --- src/parser/unicode_ranges.rs | 346 +++++++++++++++++++++++++++++++++++ src/parser/utils.rs | 21 +++ 2 files changed, 367 insertions(+) create mode 100644 src/parser/unicode_ranges.rs diff --git a/src/parser/unicode_ranges.rs b/src/parser/unicode_ranges.rs new file mode 100644 index 0000000..03781a4 --- /dev/null +++ b/src/parser/unicode_ranges.rs @@ -0,0 +1,346 @@ +// These ranges are extracted from Unicode DB(as XML) using +// /scripts/extract_punct_chars.py +// NOTE that they are sorted. +// --Farooq fkz riseup.net +// farooqkz testrun.org +pub const UNICODE_WHITESPACE_RANGES: [RangeInclusive; 339] =[ + 0x21..=0x2f, + 0x3a..=0x40, + 0x5b..=0x60, + 0x7b..=0x7e, + 0xa1..=0xa9, + 0xab..=0xac, + 0xae..=0xb1, + 0xb4..=0xb4, + 0xb6..=0xb8, + 0xbb..=0xbb, + 0xbf..=0xbf, + 0xd7..=0xd7, + 0xf7..=0xf7, + 0x2c2..=0x2c5, + 0x2d2..=0x2df, + 0x2e5..=0x2eb, + 0x2ed..=0x2ed, + 0x2ef..=0x2ff, + 0x375..=0x375, + 0x37e..=0x37e, + 0x384..=0x385, + 0x387..=0x387, + 0x3f6..=0x3f6, + 0x482..=0x482, + 0x55a..=0x55f, + 0x589..=0x58a, + 0x58d..=0x58f, + 0x5be..=0x5be, + 0x5c0..=0x5c0, + 0x5c3..=0x5c3, + 0x5c6..=0x5c6, + 0x5f3..=0x5f4, + 0x606..=0x60f, + 0x61b..=0x61b, + 0x61d..=0x61f, + 0x66a..=0x66d, + 0x6d4..=0x6d4, + 0x6de..=0x6de, + 0x6e9..=0x6e9, + 0x6fd..=0x6fe, + 0x700..=0x70d, + 0x7f6..=0x7f9, + 0x7fe..=0x7ff, + 0x830..=0x83e, + 0x85e..=0x85e, + 0x888..=0x888, + 0x964..=0x965, + 0x970..=0x970, + 0x9f2..=0x9f3, + 0x9fa..=0x9fb, + 0x9fd..=0x9fd, + 0xa76..=0xa76, + 0xaf0..=0xaf1, + 0xb70..=0xb70, + 0xbf3..=0xbfa, + 0xc77..=0xc77, + 0xc7f..=0xc7f, + 0xc84..=0xc84, + 0xd4f..=0xd4f, + 0xd79..=0xd79, + 0xdf4..=0xdf4, + 0xe3f..=0xe3f, + 0xe4f..=0xe4f, + 0xe5a..=0xe5b, + 0xf01..=0xf17, + 0xf1a..=0xf1f, + 0xf34..=0xf34, + 0xf36..=0xf36, + 0xf38..=0xf38, + 0xf3a..=0xf3d, + 0xf85..=0xf85, + 0xfbe..=0xfc5, + 0xfc7..=0xfcc, + 0xfce..=0xfda, + 0x104a..=0x104f, + 0x109e..=0x109f, + 0x10fb..=0x10fb, + 0x1360..=0x1368, + 0x1390..=0x1399, + 0x1400..=0x1400, + 0x166d..=0x166e, + 0x169b..=0x169c, + 0x16eb..=0x16ed, + 0x1735..=0x1736, + 0x17d4..=0x17d6, + 0x17d8..=0x17db, + 0x1800..=0x180a, + 0x1940..=0x1940, + 0x1944..=0x1945, + 0x19de..=0x19ff, + 0x1a1e..=0x1a1f, + 0x1aa0..=0x1aa6, + 0x1aa8..=0x1aad, + 0x1b5a..=0x1b6a, + 0x1b74..=0x1b7e, + 0x1bfc..=0x1bff, + 0x1c3b..=0x1c3f, + 0x1c7e..=0x1c7f, + 0x1cc0..=0x1cc7, + 0x1cd3..=0x1cd3, + 0x1fbd..=0x1fbd, + 0x1fbf..=0x1fc1, + 0x1fcd..=0x1fcf, + 0x1fdd..=0x1fdf, + 0x1fed..=0x1fef, + 0x1ffd..=0x1ffe, + 0x2010..=0x2027, + 0x2030..=0x205e, + 0x207a..=0x207e, + 0x208a..=0x208e, + 0x20a0..=0x20c0, + 0x2100..=0x2101, + 0x2103..=0x2106, + 0x2108..=0x2109, + 0x2114..=0x2114, + 0x2116..=0x2118, + 0x211e..=0x2123, + 0x2125..=0x2125, + 0x2127..=0x2127, + 0x2129..=0x2129, + 0x212e..=0x212e, + 0x213a..=0x213b, + 0x2140..=0x2144, + 0x214a..=0x214d, + 0x214f..=0x214f, + 0x218a..=0x218b, + 0x2190..=0x2426, + 0x2440..=0x244a, + 0x249c..=0x24e9, + 0x2500..=0x2775, + 0x2794..=0x2b73, + 0x2b76..=0x2b95, + 0x2b97..=0x2bff, + 0x2ce5..=0x2cea, + 0x2cf9..=0x2cfc, + 0x2cfe..=0x2cff, + 0x2d70..=0x2d70, + 0x2e00..=0x2e2e, + 0x2e30..=0x2e5d, + 0x2e80..=0x2e99, + 0x2e9b..=0x2ef3, + 0x2f00..=0x2fd5, + 0x2ff0..=0x2fff, + 0x3001..=0x3004, + 0x3008..=0x3020, + 0x3030..=0x3030, + 0x3036..=0x3037, + 0x303d..=0x303f, + 0x309b..=0x309c, + 0x30a0..=0x30a0, + 0x30fb..=0x30fb, + 0x3190..=0x3191, + 0x3196..=0x319f, + 0x31c0..=0x31e3, + 0x31ef..=0x31ef, + 0x3200..=0x321e, + 0x322a..=0x3247, + 0x3250..=0x3250, + 0x3260..=0x327f, + 0x328a..=0x32b0, + 0x32c0..=0x33ff, + 0x4dc0..=0x4dff, + 0xa490..=0xa4c6, + 0xa4fe..=0xa4ff, + 0xa60d..=0xa60f, + 0xa673..=0xa673, + 0xa67e..=0xa67e, + 0xa6f2..=0xa6f7, + 0xa700..=0xa716, + 0xa720..=0xa721, + 0xa789..=0xa78a, + 0xa828..=0xa82b, + 0xa836..=0xa839, + 0xa874..=0xa877, + 0xa8ce..=0xa8cf, + 0xa8f8..=0xa8fa, + 0xa8fc..=0xa8fc, + 0xa92e..=0xa92f, + 0xa95f..=0xa95f, + 0xa9c1..=0xa9cd, + 0xa9de..=0xa9df, + 0xaa5c..=0xaa5f, + 0xaa77..=0xaa79, + 0xaade..=0xaadf, + 0xaaf0..=0xaaf1, + 0xab5b..=0xab5b, + 0xab6a..=0xab6b, + 0xabeb..=0xabeb, + 0xfb29..=0xfb29, + 0xfbb2..=0xfbc2, + 0xfd3e..=0xfd4f, + 0xfdcf..=0xfdcf, + 0xfdfc..=0xfdff, + 0xfe10..=0xfe19, + 0xfe30..=0xfe52, + 0xfe54..=0xfe66, + 0xfe68..=0xfe6b, + 0xff01..=0xff0f, + 0xff1a..=0xff20, + 0xff3b..=0xff40, + 0xff5b..=0xff65, + 0xffe0..=0xffe6, + 0xffe8..=0xffee, + 0xfffc..=0xfffd, + 0x10100..=0x10102, + 0x10137..=0x1013f, + 0x10179..=0x10189, + 0x1018c..=0x1018e, + 0x10190..=0x1019c, + 0x101a0..=0x101a0, + 0x101d0..=0x101fc, + 0x1039f..=0x1039f, + 0x103d0..=0x103d0, + 0x1056f..=0x1056f, + 0x10857..=0x10857, + 0x10877..=0x10878, + 0x1091f..=0x1091f, + 0x1093f..=0x1093f, + 0x10a50..=0x10a58, + 0x10a7f..=0x10a7f, + 0x10ac8..=0x10ac8, + 0x10af0..=0x10af6, + 0x10b39..=0x10b3f, + 0x10b99..=0x10b9c, + 0x10ead..=0x10ead, + 0x10f55..=0x10f59, + 0x10f86..=0x10f89, + 0x11047..=0x1104d, + 0x110bb..=0x110bc, + 0x110be..=0x110c1, + 0x11140..=0x11143, + 0x11174..=0x11175, + 0x111c5..=0x111c8, + 0x111cd..=0x111cd, + 0x111db..=0x111db, + 0x111dd..=0x111df, + 0x11238..=0x1123d, + 0x112a9..=0x112a9, + 0x1144b..=0x1144f, + 0x1145a..=0x1145b, + 0x1145d..=0x1145d, + 0x114c6..=0x114c6, + 0x115c1..=0x115d7, + 0x11641..=0x11643, + 0x11660..=0x1166c, + 0x116b9..=0x116b9, + 0x1173c..=0x1173f, + 0x1183b..=0x1183b, + 0x11944..=0x11946, + 0x119e2..=0x119e2, + 0x11a3f..=0x11a46, + 0x11a9a..=0x11a9c, + 0x11a9e..=0x11aa2, + 0x11b00..=0x11b09, + 0x11c41..=0x11c45, + 0x11c70..=0x11c71, + 0x11ef7..=0x11ef8, + 0x11f43..=0x11f4f, + 0x11fd5..=0x11ff1, + 0x11fff..=0x11fff, + 0x12470..=0x12474, + 0x12ff1..=0x12ff2, + 0x16a6e..=0x16a6f, + 0x16af5..=0x16af5, + 0x16b37..=0x16b3f, + 0x16b44..=0x16b45, + 0x16e97..=0x16e9a, + 0x16fe2..=0x16fe2, + 0x1bc9c..=0x1bc9c, + 0x1bc9f..=0x1bc9f, + 0x1cf50..=0x1cfc3, + 0x1d000..=0x1d0f5, + 0x1d100..=0x1d126, + 0x1d129..=0x1d164, + 0x1d16a..=0x1d16c, + 0x1d183..=0x1d184, + 0x1d18c..=0x1d1a9, + 0x1d1ae..=0x1d1ea, + 0x1d200..=0x1d241, + 0x1d245..=0x1d245, + 0x1d300..=0x1d356, + 0x1d6c1..=0x1d6c1, + 0x1d6db..=0x1d6db, + 0x1d6fb..=0x1d6fb, + 0x1d715..=0x1d715, + 0x1d735..=0x1d735, + 0x1d74f..=0x1d74f, + 0x1d76f..=0x1d76f, + 0x1d789..=0x1d789, + 0x1d7a9..=0x1d7a9, + 0x1d7c3..=0x1d7c3, + 0x1d800..=0x1d9ff, + 0x1da37..=0x1da3a, + 0x1da6d..=0x1da74, + 0x1da76..=0x1da83, + 0x1da85..=0x1da8b, + 0x1e14f..=0x1e14f, + 0x1e2ff..=0x1e2ff, + 0x1e95e..=0x1e95f, + 0x1ecac..=0x1ecac, + 0x1ecb0..=0x1ecb0, + 0x1ed2e..=0x1ed2e, + 0x1eef0..=0x1eef1, + 0x1f000..=0x1f02b, + 0x1f030..=0x1f093, + 0x1f0a0..=0x1f0ae, + 0x1f0b1..=0x1f0bf, + 0x1f0c1..=0x1f0cf, + 0x1f0d1..=0x1f0f5, + 0x1f10d..=0x1f1ad, + 0x1f1e6..=0x1f202, + 0x1f210..=0x1f23b, + 0x1f240..=0x1f248, + 0x1f250..=0x1f251, + 0x1f260..=0x1f265, + 0x1f300..=0x1f6d7, + 0x1f6dc..=0x1f6ec, + 0x1f6f0..=0x1f6fc, + 0x1f700..=0x1f776, + 0x1f77b..=0x1f7d9, + 0x1f7e0..=0x1f7eb, + 0x1f7f0..=0x1f7f0, + 0x1f800..=0x1f80b, + 0x1f810..=0x1f847, + 0x1f850..=0x1f859, + 0x1f860..=0x1f887, + 0x1f890..=0x1f8ad, + 0x1f8b0..=0x1f8b1, + 0x1f900..=0x1fa53, + 0x1fa60..=0x1fa6d, + 0x1fa70..=0x1fa7c, + 0x1fa80..=0x1fa88, + 0x1fa90..=0x1fabd, + 0x1fabf..=0x1fac5, + 0x1face..=0x1fadb, + 0x1fae0..=0x1fae8, + 0x1faf0..=0x1faf8, + 0x1fb00..=0x1fb92, + 0x1fb94..=0x1fbca, +]; diff --git a/src/parser/utils.rs b/src/parser/utils.rs index aacbe92..97ccf5f 100644 --- a/src/parser/utils.rs +++ b/src/parser/utils.rs @@ -1,4 +1,5 @@ use std::ops::RangeInclusive; +use super::unicode_ranges::UNICODE_PUNCTUATION_RANGES; #[derive(Debug, PartialEq, Eq)] enum FindRangeResult<'a> { @@ -80,6 +81,26 @@ pub(crate) fn is_white_space(c: char) -> bool { matches!(c, '\n' | '\r' | '\t' | ' ') } +pub(crate) fn is_unicode_whitespace(c: char) -> bool { + is_white_space(c) || + matches!(c, + 0x20 | + 0xa0 | + 0x1680..=0x1680 | + 0x2000..=0x200a | + 0x202f..=0x202f | + 0x205f..=0x205f | + 0x3000..=0x3000) + // These ranges are extracted from unicode DB using + // the script /scripts/extract_unicode_whitespace_ranges.py + // -- Farooq fkz riseup.net + // farooqkz testrun.org +} + +pub(crate) fn is_unicode_punctutation(c: char) -> bool { + is_in_one_of_ranges(c as u32, UNICODE_PUNCTUATION_RANGES[..]) +} + pub(crate) fn is_not_white_space(c: char) -> bool { !is_white_space(c) }