perl/perl-extended-charclass-assert.diff

--- ./pod/perldiag.pod.orig	2017-07-18 23:00:00.000000000 +0000
+++ ./pod/perldiag.pod	2018-11-21 13:46:06.998215887 +0000
@@ -5904,7 +5904,7 @@ yourself.
 a perl4 interpreter, especially if the next 2 tokens are "use strict"
 or "my $var" or "our $var".

-=item Syntax error in (?[...]) in regex m/%s/
+=item Syntax error in (?[...]) in regex; marked by <-- HERE in m/%s/

 (F) Perl could not figure out what you meant inside this construct; this
 notifies you that it is giving up trying.
@@ -6402,6 +6402,31 @@ to find out why that isn't happening.
 (F) The unexec() routine failed for some reason.  See your local FSF
 representative, who probably put it there in the first place.

+=item Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/%s/
+
+(F) While parsing an extended character class, a ']' character was encountered
+at a point in the definition where the only legal use of ']' is to close the
+character class definition as part of a '])'.  You may have forgotten the
+close paren, or otherwise confused the parser.
+
+=item Expecting close paren for nested extended charclass in regex; marked by <-- HERE in m/%s/
+
+(F) While parsing a nested extended character class like:
+
+    (?[ ... (?flags:(?[ ... ])) ... ])
+                             ^
+
+we expected to see a close paren ')' (marked by ^) but did not.
+
+=item Expecting close paren for wrapper for nested extended charclass in regex; marked by <-- HERE in m/%s/
+
+(F) While parsing a nested extended character class like:
+
+    (?[ ... (?flags:(?[ ... ])) ... ])
+                              ^
+
+we expected to see the wrapper's close paren ')' (marked by ^) but did not.
+
 =item Unexpected binary operator '%c' with no preceding operand in regex;
 marked by S<<-- HERE> in m/%s/

--- ./pod/perlrecharclass.pod.orig	2017-07-18 22:50:12.000000000 +0000
+++ ./pod/perlrecharclass.pod	2018-11-21 13:46:06.998215887 +0000
@@ -1128,8 +1128,8 @@ hence both of the following work:
 Any contained POSIX character classes, including things like C<\w> and C<\D>
 respect the C<E<sol>a> (and C<E<sol>aa>) modifiers.

-C<< (?[ ]) >> is a regex-compile-time construct.  Any attempt to use
-something which isn't knowable at the time the containing regular
+Note that C<< (?[ ]) >> is a regex-compile-time construct.  Any attempt
+to use something which isn't knowable at the time the containing regular
 expression is compiled is a fatal error.  In practice, this means
 just three limitations:

--- ./regcomp.c.orig	2018-11-21 13:45:49.814256819 +0000
+++ ./regcomp.c	2018-11-21 13:46:07.002215878 +0000
@@ -14820,8 +14820,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *
                                     TRUE /* Force /x */ );

             switch (*RExC_parse) {
-                case '?':
-                    if (RExC_parse[1] == '[') depth++, RExC_parse++;
+                case '(':
+                    if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
+                        depth++, RExC_parse+=2;
                     /* FALLTHROUGH */
                 default:
                     break;
@@ -14878,9 +14879,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *
                 }

                 case ']':
-                    if (depth--) break;
-                    RExC_parse++;
-                    if (*RExC_parse == ')') {
+                    if (RExC_parse[1] == ')') {
+                        RExC_parse++;
+                        if (depth--) break;
                         node = reganode(pRExC_state, ANYOF, 0);
                         RExC_size += ANYOF_SKIP;
                         nextchar(pRExC_state);
@@ -14892,20 +14893,25 @@ S_handle_regex_sets(pTHX_ RExC_state_t *

                         return node;
                     }
-                    goto no_close;
+                    /* We output the messages even if warnings are off, because we'll fail
+                     * the very next thing, and these give a likely diagnosis for that */
+                    if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
+                        output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
+                    }
+                    RExC_parse++;
+                    vFAIL("Unexpected ']' with no following ')' in (?[...");
             }

             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
         }

-      no_close:
         /* We output the messages even if warnings are off, because we'll fail
          * the very next thing, and these give a likely diagnosis for that */
         if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
             output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
         }

-        FAIL("Syntax error in (?[...])");
+        vFAIL("Syntax error in (?[...])");
     }

     /* Pass 2 only after this. */
@@ -15085,12 +15091,14 @@ redo_curchar:
                      * inversion list, and RExC_parse points to the trailing
                      * ']'; the next character should be the ')' */
                     RExC_parse++;
-                    assert(UCHARAT(RExC_parse) == ')');
+                    if (UCHARAT(RExC_parse) != ')')
+                        vFAIL("Expecting close paren for nested extended charclass");

                     /* Then the ')' matching the original '(' handled by this
                      * case: statement */
                     RExC_parse++;
-                    assert(UCHARAT(RExC_parse) == ')');
+                    if (UCHARAT(RExC_parse) != ')')
+                        vFAIL("Expecting close paren for wrapper for nested extended charclass");

                     RExC_parse++;
                     RExC_flags = save_flags;
--- ./t/lib/warnings/regcomp.orig	2017-07-18 23:00:00.000000000 +0000
+++ ./t/lib/warnings/regcomp	2018-11-21 13:46:07.002215878 +0000
@@ -59,21 +59,21 @@ Unmatched [ in regex; marked by <-- HERE
 qr/(?[[[:word]]])/;
 EXPECT
 Assuming NOT a POSIX class since there is no terminating ':' in regex; marked by <-- HERE in m/(?[[[:word <-- HERE ]]])/ at - line 2.
-syntax error in (?[...]) in regex m/(?[[[:word]]])/ at - line 2.
+Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/(?[[[:word]] <-- HERE ])/ at - line 2.
 ########
 # NAME qr/(?[ [[:digit: ])/
 # OPTION fatal
 qr/(?[[[:digit: ])/;
 EXPECT
 Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[[:digit: ] <-- HERE )/ at - line 2.
-syntax error in (?[...]) in regex m/(?[[[:digit: ])/ at - line 2.
+syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[[:digit: ]) <-- HERE / at - line 2.
 ########
 # NAME qr/(?[ [:digit: ])/
 # OPTION fatal
 qr/(?[[:digit: ])/
 EXPECT
 Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[:digit: ] <-- HERE )/ at - line 2.
-syntax error in (?[...]) in regex m/(?[[:digit: ])/ at - line 2.
+syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[:digit: ]) <-- HERE / at - line 2.
 ########
 # NAME [perl #126141]
 # OPTION fatal
--- ./t/re/reg_mesg.t.orig	2017-09-07 19:27:40.000000000 +0000
+++ ./t/re/reg_mesg.t	2018-11-21 13:46:07.002215878 +0000
@@ -213,8 +213,9 @@ my @death =
  '/\b{gc}/' => "'gc' is an unknown bound type {#} m/\\b{gc{#}}/",
  '/\B{gc}/' => "'gc' is an unknown bound type {#} m/\\B{gc{#}}/",

- '/(?[[[::]]])/' => "Syntax error in (?[...]) in regex m/(?[[[::]]])/",
- '/(?[[[:w:]]])/' => "Syntax error in (?[...]) in regex m/(?[[[:w:]]])/",
+
+ '/(?[[[::]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[::]]{#}])/",
+ '/(?[[[:w:]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[:w:]]{#}])/",
  '/(?[[:w:]])/' => "",
  '/([.].*)[.]/'   => "",    # [perl #127582]
  '/[.].*[.]/'     => "",    # [perl #127604]
@@ -237,11 +238,12 @@ my @death =
  '/(?[ \p{foo} ])/' => 'Can\'t find Unicode property definition "foo" {#} m/(?[ \p{foo}{#} ])/',
  '/(?[ \p{ foo = bar } ])/' => 'Can\'t find Unicode property definition "foo = bar" {#} m/(?[ \p{ foo = bar }{#} ])/',
  '/(?[ \8 ])/' => 'Unrecognized escape \8 in character class {#} m/(?[ \8{#} ])/',
- '/(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ]/',
- '/(?[ [ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ \t ]/',
- '/(?[ \t ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ] ]/',
- '/(?[ [ ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ ] ]/',
- '/(?[ \t + \e # This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # This was supposed to be a comment ])/',
+ '/(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#}/",
+ '/(?[ [ \t ]/' => "Syntax error in (?[...]) {#} m/(?[ [ \\t ]{#}/",
+ '/(?[ \t ] ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#} ]/",
+ '/(?[ [ ] ]/' => "Syntax error in (?[...]) {#} m/(?[ [ ] ]{#}/",
+ '/(?[ \t + \e # This was supposed to be a comment ])/' =>
+    "Syntax error in (?[...]) {#} m/(?[ \\t + \\e # This was supposed to be a comment ]){#}/",
  '/(?[ ])/' => 'Incomplete expression within \'(?[ ])\' {#} m/(?[ {#}])/',
  'm/(?[[a-\d]])/' => 'False [] range "a-\d" {#} m/(?[[a-\d{#}]])/',
  'm/(?[[\w-x]])/' => 'False [] range "\w-" {#} m/(?[[\w-{#}x]])/',
@@ -427,10 +429,10 @@ my @death_utf8 = mark_as_utf8(

  '/ネ\p{}ネ/' => 'Empty \p{} {#} m/ネ\p{{#}}ネ/',

- '/ネ(?[[[:ネ]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ]]])ネ/",
- '/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ: ])ネ/",
- '/ネ(?[[[::]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[::]]])ネ/",
- '/ネ(?[[[:ネ:]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ:]]])ネ/",
+ '/ネ(?[[[:ネ]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ]]{#}])ネ/",
+ '/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) {#} m/ネ(?[[[:ネ: ])ネ{#}/",
+ '/ネ(?[[[::]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[::]]{#}])ネ/",
+ '/ネ(?[[[:ネ:]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ:]]{#}])ネ/",
  '/ネ(?[[:ネ:]])ネ/' => "",
  '/ネ(?[ネ])ネ/' =>  'Unexpected character {#} m/ネ(?[ネ{#}])ネ/',
  '/ネ(?[ + [ネ] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ +{#} [ネ] ])/',
@@ -443,8 +445,9 @@ my @death_utf8 = mark_as_utf8(
  '/(?[ \x{ネ} ])ネ/' => 'Non-hex character {#} m/(?[ \x{ネ{#}} ])ネ/',
  '/(?[ \p{ネ} ])/' => 'Can\'t find Unicode property definition "ネ" {#} m/(?[ \p{ネ}{#} ])/',
  '/(?[ \p{ ネ = bar } ])/' => 'Can\'t find Unicode property definition "ネ = bar" {#} m/(?[ \p{ ネ = bar }{#} ])/',
- '/ネ(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/ネ(?[ \t ]/',
- '/(?[ \t + \e # ネ This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # ネ This was supposed to be a comment ])/',
+ '/ネ(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[ \\t ]{#}/",
+ '/(?[ \t + \e # ネ This was supposed to be a comment ])/' =>
+    "Syntax error in (?[...]) {#} m/(?[ \\t + \\e # ネ This was supposed to be a comment ]){#}/",
  'm/(*ネ)ネ/' => q<Unknown verb pattern 'ネ' {#} m/(*ネ){#}ネ/>,
  '/\cネ/' => "Character following \"\\c\" must be printable ASCII",
  '/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/",
--- ./t/re/regex_sets.t.orig	2017-07-18 22:50:18.000000000 +0000
+++ ./t/re/regex_sets.t	2018-11-21 13:46:07.002215878 +0000
@@ -158,13 +158,13 @@ for my $char ("٠", "٥", "٩") {
 	eval { $_ = '/(?[(\c]) /'; qr/$_/ };
 	like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
 	eval { $_ = '(?[\c#]' . "\n])"; qr/$_/ };
-	like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
+	like($@, qr/^Unexpected/, '/(?[(\c]) / should not panic');
 	eval { $_ = '(?[(\c])'; qr/$_/ };
 	like($@, qr/^Syntax error/, '/(?[(\c])/ should be a syntax error');
 	eval { $_ = '(?[(\c]) ]\b'; qr/$_/ };
-	like($@, qr/^Syntax error/, '/(?[(\c]) ]\b/ should be a syntax error');
+	like($@, qr/^Unexpected/, '/(?[(\c]) ]\b/ should be a syntax error');
 	eval { $_ = '(?[\c[]](])'; qr/$_/ };
-	like($@, qr/^Syntax error/, '/(?[\c[]](])/ should be a syntax error');
+	like($@, qr/^Unexpected/, '/(?[\c[]](])/ should be a syntax error');
 	like("\c#", qr/(?[\c#])/, '\c# should match itself');
 	like("\c[", qr/(?[\c[])/, '\c[ should match itself');
 	like("\c\ ", qr/(?[\c\])/, '\c\ should match itself');