Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

<regex>: Implement collating ranges #5238

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 55 additions & 30 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -1720,7 +1720,7 @@ public:

private:
// lexing
void _Error(regex_constants::error_type);
[[noreturn]] void _Error(regex_constants::error_type);

bool _Is_esc() const;
void _Trans();
Expand Down Expand Up @@ -2917,7 +2917,8 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_range2(const _Elem _Arg0, const _E

_Node->_Small->_Mark(_Ex0);
}
if (_Ex1 >= _Ex0) {

if (_Flags & regex_constants::collate || _Ex1 >= _Ex0) {
Copy link
Contributor Author

@muellerj2 muellerj2 Jan 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The control flow is subtle around here, but the function ends up doing the right thing. When the collate flag is set, the _Builder constructor sets the member variables such that _Get_tmax() and _Get_bmax() both return 0. This means that the preceding loop is skipped and the inner if condition is false (even if _Ex1 < _Ex0, because _Ex1 - _Ex0 is computed as an unsigned integer and so can never be less than 0). So in the end, the code just adds the bounds to _Node->_Ranges.

if (_Ex1 - _Ex0 < _Get_tmax()) {
for (; _Ex0 <= _Ex1; ++_Ex0) {
_Add_char_to_array(static_cast<_Elem>(_Ex0));
Expand Down Expand Up @@ -3355,6 +3356,20 @@ bool _Lookup_range(unsigned int _Ch, const _Buf<_Elem>* _Bufptr) { // check whet
return false;
}

template <class _Elem, class _RxTraits>
bool _Lookup_collating_range(_Elem _Ch, const _Buf<_Elem>* _Bufptr, const _RxTraits& _Traits) {
typename _RxTraits::string_type _Str = _Traits.transform(_STD addressof(_Ch), _STD addressof(_Ch) + 1);
for (unsigned int _Ix = 0; _Ix < _Bufptr->_Size(); _Ix += 2) { // check current position
const _Elem _Left = _Bufptr->_At(_Ix);
const _Elem _Right = _Bufptr->_At(_Ix + 1);
if (_Traits.transform(_STD addressof(_Left), _STD addressof(_Left) + 1) <= _Str
&& _Str <= _Traits.transform(_STD addressof(_Right), _STD addressof(_Right) + 1)) {
return true;
}
}
return false;
}

template <class _Elem, class _RxTraits>
bool _Lookup_equiv(typename _RxTraits::_Uelem _Ch, const _Sequence<_Elem>* _Eq, const _RxTraits& _Traits) {
// check whether _Ch is in _Eq
Expand Down Expand Up @@ -3398,35 +3413,36 @@ _BidIt _Lookup_coll(_BidIt _First, _BidIt _Last, const _Sequence<_Elem>* _Eq) {
template <class _BidIt, class _Elem, class _RxTraits, class _It>
bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // apply bracket expression
bool _Found;
auto _Ch = static_cast<typename _RxTraits::_Uelem>(*_Tgt_state._Cur);
_Elem _Ch = *_Tgt_state._Cur;
if (_Sflags & regex_constants::icase) {
_Ch = static_cast<typename _RxTraits::_Uelem>(_Traits.translate_nocase(static_cast<_Elem>(_Ch)));
_Ch = _Traits.translate_nocase(_Ch);
} else if (_Sflags & regex_constants::collate) {
_Ch = _Traits.translate(_Ch);
}
auto _UCh = static_cast<typename _RxTraits::_Uelem>(_Ch);

_It _Res0 = _Tgt_state._Cur;
++_Res0;
_It _Resx;
_Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Nx);
if (_Node->_Coll
&& (_Resx = _Lookup_coll(_Tgt_state._Cur, _End, _Node->_Coll))
&& (_Resx = _STD _Lookup_coll(_Tgt_state._Cur, _End, _Node->_Coll))
!= _Tgt_state._Cur) { // check for collation element
_Res0 = _Resx;
_Found = true;
} else if (_Node->_Ranges
&& (_Lookup_range(static_cast<typename _RxTraits::_Uelem>(
_Sflags & regex_constants::collate ? _Traits.translate(static_cast<_Elem>(_Ch))
: static_cast<_Elem>(_Ch)),
_Node->_Ranges))) {
&& (_Sflags & regex_constants::collate ? _STD _Lookup_collating_range(_Ch, _Node->_Ranges, _Traits)
: _STD _Lookup_range(_UCh, _Node->_Ranges))) {
_Found = true;
} else if (_Ch < _Bmp_max) {
_Found = _Node->_Small && _Node->_Small->_Find(_Ch);
} else if (_UCh < _Bmp_max) {
_Found = _Node->_Small && _Node->_Small->_Find(_UCh);
} else if (_Node->_Large
&& _STD find(_Node->_Large->_Str(), _Node->_Large->_Str() + _Node->_Large->_Size(), _Ch)
!= _Node->_Large->_Str() + _Node->_Large->_Size()) {
_Found = true;
} else if (_Node->_Classes != 0 && _Traits.isctype(static_cast<_Elem>(_Ch), _Node->_Classes)) {
} else if (_Node->_Classes != typename _RxTraits::char_class_type{} && _Traits.isctype(_Ch, _Node->_Classes)) {
_Found = true;
} else if (_Node->_Equiv && _Lookup_equiv(_Ch, _Node->_Equiv, _Traits)) {
} else if (_Node->_Equiv && _STD _Lookup_equiv(_UCh, _Node->_Equiv, _Traits)) {
_Found = true;
} else {
_Found = false;
Expand Down Expand Up @@ -3746,34 +3762,36 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt
case _N_class:
{ // check for string match
for (; _First_arg != _Last; ++_First_arg) { // look for starting match
using _Uelem = typename _RxTraits::_Uelem;
bool _Found;
auto _Ch = static_cast<_Uelem>(*_First_arg);
_Elem _Ch = *_First_arg;
if (_Sflags & regex_constants::icase) {
_Ch = _Traits.translate_nocase(_Ch);
} else if (_Sflags & regex_constants::collate) {
_Ch = _Traits.translate(_Ch);
}
auto _UCh = static_cast<typename _RxTraits::_Uelem>(_Ch);

_Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Nx);
_It _Next = _First_arg;
++_Next;

if (_Sflags & regex_constants::icase) {
_Ch = static_cast<_Uelem>(_Traits.translate_nocase(static_cast<_Elem>(_Ch)));
}

if (_Node->_Coll && _Lookup_coll(_First_arg, _Next, _Node->_Coll) != _First_arg) {
if (_Node->_Coll && _STD _Lookup_coll(_First_arg, _Next, _Node->_Coll) != _First_arg) {
_Found = true;
} else if (_Node->_Ranges
&& (_Lookup_range(static_cast<_Uelem>(_Sflags & regex_constants::collate
? _Traits.translate(static_cast<_Elem>(_Ch))
: static_cast<_Elem>(_Ch)),
_Node->_Ranges))) {
&& (_Sflags & regex_constants::collate
? _STD _Lookup_collating_range(_Ch, _Node->_Ranges, _Traits)
: _STD _Lookup_range(_UCh, _Node->_Ranges))) {
_Found = true;
} else if (_Ch < _Bmp_max) {
_Found = _Node->_Small && _Node->_Small->_Find(_Ch);
} else if (_UCh < _Bmp_max) {
_Found = _Node->_Small && _Node->_Small->_Find(_UCh);
} else if (_Node->_Large
&& _STD find(_Node->_Large->_Str(), _Node->_Large->_Str() + _Node->_Large->_Size(), _Ch)
!= _Node->_Large->_Str() + _Node->_Large->_Size()) {
_Found = true;
} else if (_Node->_Classes && _Traits.isctype(static_cast<_Elem>(_Ch), _Node->_Classes)) {
} else if (_Node->_Classes != typename _RxTraits::char_class_type{}
&& _Traits.isctype(_Ch, _Node->_Classes)) {
_Found = true;
} else if (_Node->_Equiv && _Lookup_equiv(_Ch, _Node->_Equiv, _Traits)) {
} else if (_Node->_Equiv && _STD _Lookup_equiv(_UCh, _Node->_Equiv, _Traits)) {
_Found = true;
} else {
_Found = false;
Expand Down Expand Up @@ -3842,7 +3860,7 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt
}

template <class _FwdIt, class _Elem, class _RxTraits>
void _Parser<_FwdIt, _Elem, _RxTraits>::_Error(regex_constants::error_type _Code) { // handle error
[[noreturn]] void _Parser<_FwdIt, _Elem, _RxTraits>::_Error(regex_constants::error_type _Code) { // handle error
_Xregex_error(_Code);
}

Expand Down Expand Up @@ -4156,7 +4174,14 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_ClassRanges() { // check for valid clas
_Chr2 = _Traits.translate(_Chr2);
}

if (static_cast<typename _RxTraits::_Uelem>(_Chr2) < static_cast<typename _RxTraits::_Uelem>(_Chr1)) {
if (_Flags & regex_constants::collate) {
const _Elem* const _Chr1_ptr = _STD addressof(_Chr1);
const _Elem* const _Chr2_ptr = _STD addressof(_Chr2);
if (_Traits.transform(_Chr2_ptr, _Chr2_ptr + 1) < _Traits.transform(_Chr1_ptr, _Chr1_ptr + 1)) {
_Error(regex_constants::error_range);
}
} else if (static_cast<typename _RxTraits::_Uelem>(_Chr2)
< static_cast<typename _RxTraits::_Uelem>(_Chr1)) {
_Error(regex_constants::error_range);
}

Expand Down
4 changes: 4 additions & 0 deletions tests/std/tests/GH_005204_regex_collating_ranges/env.lst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

RUNALL_INCLUDE ..\usual_matrix.lst
Loading