This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH 17/17] Regex: Implement Rational Range Interpretation.
- From: Arnold Robbins <arnold at skeeve dot com>
- To: carlos at redhat dot com, libc-alpha at sourceware dot org
- Date: Fri, 08 Dec 2017 11:19:24 +0200
- Subject: [PATCH 17/17] Regex: Implement Rational Range Interpretation.
- Authentication-results: sourceware.org; auth=none
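This patch implements Rational Range Interpretation: the endpoints of a
bracket-expression range are compared by their wide character values
instead of by locale collation order (wcscoll). For some history and
discussion, see the gawk manual:
https://www.gnu.org/software/gawk/manual/html_node/Ranges-and-Locales.html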
2017-11-30 Arnold D. Robbins <arnold@skeeve.com>
Implement Rational Range Interpretation.
* posix/regcomp.c (build_range_exp): Pass in the syntax bits.
Use them to check RE_NO_EMPTY_RANGES, and compare wide char values.
Remove use of wscoll to determine range start and end.
(parse_bracket_exp): Pass the syntax bits to build_range_exp.
* posix/regexec.c (check_node_accept_bytes): Don't use wscoll
to check ranges, but rather wide character values.
diff --git a/posix/regcomp.c b/posix/regcomp.c
index e63c258..0005fe7 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -2654,11 +2654,12 @@ parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
static reg_errcode_t
# ifdef RE_ENABLE_I18N
-build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
- bracket_elem_t *start_elem, bracket_elem_t *end_elem)
-# else /* not RE_ENABLE_I18N */
-build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
+build_range_exp (reg_syntax_t syntax, bitset_t sbcset, re_charset_t *mbcset,
+ int *range_alloc, bracket_elem_t *start_elem,
bracket_elem_t *end_elem)
+# else /* not RE_ENABLE_I18N */
+build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
+ bracket_elem_t *start_elem, bracket_elem_t *end_elem)
# endif /* not RE_ENABLE_I18N */
{
unsigned int start_ch, end_ch;
@@ -2681,7 +2682,6 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
wchar_t wc;
wint_t start_wc;
wint_t end_wc;
- wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
@@ -2695,9 +2695,7 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
? __btowc (end_ch) : end_elem->opr.wch);
if (start_wc == WEOF || end_wc == WEOF)
return REG_ECOLLATE;
- cmp_buf[0] = start_wc;
- cmp_buf[4] = end_wc;
- if (__wcscoll (cmp_buf, cmp_buf + 4) > 0)
+ else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
return REG_ERANGE;
/* Got valid collation sequence values, add them as a new entry.
@@ -2745,9 +2743,7 @@ build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
/* Build the table for single byte characters. */
for (wc = 0; wc < SBC_MAX; ++wc)
{
- cmp_buf[2] = wc;
- if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
- && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+ if (start_wc <= wc && wc <= end_wc)
bitset_set (sbcset, wc);
}
}
@@ -3190,15 +3186,15 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
token_len = peek_token_bracket (token, regexp, syntax);
#ifdef _LIBC
- *err = build_range_exp (sbcset, mbcset, &range_alloc,
+ *err = build_range_exp (syntax, sbcset, mbcset, &range_alloc,
&start_elem, &end_elem);
#else
# ifdef RE_ENABLE_I18N
- *err = build_range_exp (sbcset,
+ *err = build_range_exp (syntax, sbcset,
dfa->mb_cur_max > 1 ? mbcset : NULL,
&range_alloc, &start_elem, &end_elem);
# else
- *err = build_range_exp (sbcset, &start_elem, &end_elem);
+ *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
# endif
#endif /* RE_ENABLE_I18N */
if (BE (*err != REG_NOERROR, 0))
diff --git a/posix/regexec.c b/posix/regexec.c
index dcdd33b..a9ed91f 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -3885,18 +3885,10 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
# endif /* _LIBC */
{
/* match with range expression? */
-#if __GNUC__ >= 2
- wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
-#else
- wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
- cmp_buf[2] = wc;
-#endif
for (i = 0; i < cset->nranges; ++i)
{
- cmp_buf[0] = cset->range_starts[i];
- cmp_buf[4] = cset->range_ends[i];
- if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
- && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+ if (cset->range_starts[i] <= wc
+ && wc <= cset->range_ends[i])
{
match_len = char_len;
goto check_node_accept_bytes_match;