[Bug #13671] Fix that "ss" in look-behind causes syntax error
author K.Takata <[email protected]>
Fri, 25 Jan 2019 09:54:41 +0000 (25 18:54 +0900)
committer Nobuyoshi Nakada <[email protected]>
Fri, 31 Oct 2025 11:49:59 +0000 (31 20:49 +0900)
Fixes k-takata/Onigmo#92.

This fix was ported from oniguruma:
https://github.com/kkos/oniguruma/commit/257082dac8c6019198b56324012f0bd1830ff4ba

https://github.com/k-takata/Onigmo/commit/b1a5445fbeba97b3e94a733c2ce11c033453af73

regcomp.c
spec/ruby/language/regexp_spec.rb
test/ruby/test_regexp.rb

index 12ad5d7..22dbe5f 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3301,6 +3301,14 @@ setup_subexp_call(Node* node, ScanEnv* env)
 }
 #endif
 
+#define IN_ALT          (1<<0)
+#define IN_NOT          (1<<1)
+#define IN_REPEAT       (1<<2)
+#define IN_VAR_REPEAT   (1<<3)
+#define IN_CALL         (1<<4)
+#define IN_RECCALL      (1<<5)
+#define IN_LOOK_BEHIND  (1<<6)
+
 /* divide different length alternatives in look-behind.
   (?<=A|B) ==> (?<=A)|(?<=B)
   (?<!A|B) ==> (?<!A)(?<!B)
@@ -3597,24 +3605,29 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
   return ONIGERR_MEMORY;
 }
 
-static int
-expand_case_fold_string(Node* node, regex_t* reg)
-{
 #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION  8
 
+static int
+expand_case_fold_string(Node* node, regex_t* reg, int state)
+{
   int r, n, len, alt_num;
   int varlen = 0;
+  int is_in_look_behind;
   UChar *start, *end, *p;
   Node *top_root, *root, *snode, *prev_node;
   OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
-  StrNode* sn = NSTR(node);
+  StrNode* sn;
 
   if (NSTRING_IS_AMBIG(node)) return 0;
 
+  sn = NSTR(node);
+
   start = sn->s;
   end   = sn->end;
   if (start >= end) return 0;
 
+  is_in_look_behind = (state & IN_LOOK_BEHIND) != 0;
+
   r = 0;
   top_root = root = prev_node = snode = NULL_NODE;
   alt_num = 1;
@@ -3630,7 +3643,7 @@ expand_case_fold_string(Node* node, regex_t* reg)
     len = enclen(reg->enc, p, end);
 
     varlen = is_case_fold_variable_len(n, items, len);
-    if (n == 0 || varlen == 0) {
+    if (n == 0 || varlen == 0 || is_in_look_behind) {
       if (IS_NULL(snode)) {
         if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
           onig_node_free(top_root);
@@ -3889,13 +3902,6 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env)
 }
 #endif
 
-#define IN_ALT        (1<<0)
-#define IN_NOT        (1<<1)
-#define IN_REPEAT     (1<<2)
-#define IN_VAR_REPEAT (1<<3)
-#define IN_CALL       (1<<4)
-#define IN_RECCALL    (1<<5)
-
 /* setup_tree does the following work.
  1. check empty loop. (set qn->target_empty_info)
  2. expand ignore-case in char class.
@@ -3937,7 +3943,7 @@ restart:
 
   case NT_STR:
     if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
-      r = expand_case_fold_string(node, reg);
+      r = expand_case_fold_string(node, reg, state);
     }
     break;
 
@@ -4180,7 +4186,7 @@ restart:
           if (r < 0) return r;
           if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
           if (NTYPE(node) != NT_ANCHOR) goto restart;
-          r = setup_tree(an->target, reg, state, env);
+          r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env);
           if (r != 0) return r;
           r = setup_look_behind(node, reg, env);
         }
@@ -4193,7 +4199,8 @@ restart:
           if (r < 0) return r;
           if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
           if (NTYPE(node) != NT_ANCHOR) goto restart;
-          r = setup_tree(an->target, reg, (state | IN_NOT), env);
+          r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND),
+                         env);
           if (r != 0) return r;
           r = setup_look_behind(node, reg, env);
         }
index 0cd9584..dbf341b 100644 (file)
@@ -112,7 +112,7 @@ describe "Literal Regexps" do
     /foo.(?<=\d)/.match("fooA foo1").to_a.should == ["foo1"]
   end
 
-  ruby_bug "#13671", ""..."3.6" do # https://bugs.ruby-lang.org/issues/13671
+  ruby_bug "#13671", ""..."3.5" do # https://bugs.ruby-lang.org/issues/13671
     it "handles a lookbehind with ss characters" do
       r =  Regexp.new("(?<!dss)", Regexp::IGNORECASE)
       r.should =~ "✨"
index 9f1e03e..b69c148 100644 (file)
@@ -1743,6 +1743,29 @@ class TestRegexp < Test::Unit::TestCase
     assert_raise(RegexpError, bug12418){ Regexp.new('(0?0|(?(5)||)|(?(5)||))?') }
   end
 
+  def test_ss_in_look_behind
+    assert_match_at("(?i:ss)", "ss", [[0, 2]])
+    assert_match_at("(?i:ss)", "Ss", [[0, 2]])
+    assert_match_at("(?i:ss)", "SS", [[0, 2]])
+    assert_match_at("(?i:ss)", "\u017fS", [[0, 2]])  # LATIN SMALL LETTER LONG S
+    assert_match_at("(?i:ss)", "s\u017f", [[0, 2]])
+    assert_match_at("(?i:ss)", "\u00df", [[0, 1]])   # LATIN SMALL LETTER SHARP S
+    assert_match_at("(?i:ss)", "\u1e9e", [[0, 1]])   # LATIN CAPITAL LETTER SHARP S
+    assert_match_at("(?i:xssy)", "xssy", [[0, 4]])
+    assert_match_at("(?i:xssy)", "xSsy", [[0, 4]])
+    assert_match_at("(?i:xssy)", "xSSy", [[0, 4]])
+    assert_match_at("(?i:xssy)", "x\u017fSy", [[0, 4]])
+    assert_match_at("(?i:xssy)", "xs\u017fy", [[0, 4]])
+    assert_match_at("(?i:xssy)", "x\u00dfy", [[0, 3]])
+    assert_match_at("(?i:xssy)", "x\u1e9ey", [[0, 3]])
+    assert_match_at("(?i:\u00df)", "ss", [[0, 2]])
+    assert_match_at("(?i:\u00df)", "SS", [[0, 2]])
+    assert_match_at("(?i:[\u00df])", "ss", [[0, 2]])
+    assert_match_at("(?i:[\u00df])", "SS", [[0, 2]])
+    assert_match_at("(?i)(?<!ss)\u2728", "qq\u2728", [[2, 3]])     # Issue #92
+    assert_match_at("(?i)(?<!xss)\u2728", "qq\u2728", [[2, 3]])
+  end
+
   def test_options_in_look_behind
     assert_nothing_raised {
       assert_match_at("(?<=(?i)ab)cd", "ABcd", [[2,4]])