Re: Assert triggered during RE_compile_and_cache

Поиск
Список
Период
Сортировка
От Tom Lane
Тема Re: Assert triggered during RE_compile_and_cache
Дата
Msg-id 2822987.1628372679@sss.pgh.pa.us
обсуждение исходный текст
Ответ на Re: Assert triggered during RE_compile_and_cache  (Tom Lane <tgl@sss.pgh.pa.us>)
Ответы Re: Assert triggered during RE_compile_and_cache
Список pgsql-hackers
I wrote:
>> Hm.  I thought that this might be an old issue, but I'm not seeing the
>> crash in pre-v14 branches.  That means it's some bug introduced in
>> the performance improvements I did a few months ago.  Open item added.

> git bisect says the trouble started with
> ea1268f6301cc7adce571cc9c5ebe8d9342a2ef4 is the first bad commit

Here's a draft fix for this.  The issue seems to be that parseqatom
records where a previous capture group is by saving a pointer to
the "subre" that parse() returned for it.  That was okay before
said commit because we didn't do anything further to that subre:
we immediately wrapped it into a parent subre, and then any further
hacking that parseqatom did was done on the parent subre.  But after
ea1268f63, in most cases we'd hack right on that same subre, thus
potentially modifying the portion of the NFA that would be copied
by a subsequent back-reference.

The particular thing that's asserting when you wrap {0} around such
a back-ref is just accidental road kill: it happens to notice that
the NFA is no longer consistent, because there's no path leading
from the start to the end of the copied sub-NFA, thanks to the copying
having been done between a pair of states that weren't actually
appropriate to reference.  What surprises me about this is not that
crash, but that we didn't see fundamental breakage of backref-using
patterns all over the place.  It evidently breaks only in corner
cases, but I'm not quite sure why the effects are so limited.

Anyway, the fix seems pretty straightforward.  We don't really need
anything about the subre as such except for its starting and ending
NFA states, so the attached modifies things to record only those
state pointers.  I'm not done testing this by any means, but it
does fix the two cases you showed, and I've been running that perl
script for some time with no further crashes.

I suspect the assertion you hit with the REG_NOSUB patch was just
further fallout from this same basic bug, but I've not yet rebased
that patch over this to check.

            regards, tom lane

diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 9f71177d31..afa6dd44c6 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -233,6 +233,13 @@ static int    cmp(const chr *, const chr *, size_t);
 static int    casecmp(const chr *, const chr *, size_t);


+/* info we need during compilation about a known capturing subexpression */
+struct subinfo
+{
+    struct state *left;            /* left end of its sub-NFA */
+    struct state *right;        /* right end of its sub-NFA */
+};
+
 /* internal variables, bundled for easy passing around */
 struct vars
 {
@@ -245,10 +252,10 @@ struct vars
     int            nexttype;        /* type of next token */
     chr            nextvalue;        /* value (if any) of next token */
     int            lexcon;            /* lexical context type (see regc_lex.c) */
-    int            nsubexp;        /* subexpression count */
-    struct subre **subs;        /* subRE pointer vector */
-    size_t        nsubs;            /* length of vector */
-    struct subre *sub10[10];    /* initial vector, enough for most */
+    int            nsubexp;        /* number of known capturing subexpressions */
+    struct subinfo *subs;        /* info about known capturing subexpressions */
+    size_t        nsubs;            /* allocated length of subs[] vector */
+    struct subinfo sub10[10];    /* initial vector, enough for most */
     struct nfa *nfa;            /* the NFA */
     struct colormap *cm;        /* character color map */
     color        nlcolor;        /* color of newline */
@@ -368,7 +375,7 @@ pg_regcomp(regex_t *re,
     v->subs = v->sub10;
     v->nsubs = 10;
     for (j = 0; j < v->nsubs; j++)
-        v->subs[j] = NULL;
+        v->subs[j].left = v->subs[j].right = NULL;
     v->nfa = NULL;
     v->cm = NULL;
     v->nlcolor = COLORLESS;
@@ -504,13 +511,13 @@ pg_regcomp(regex_t *re,
 }

 /*
- * moresubs - enlarge subRE vector
+ * moresubs - enlarge capturing-subexpressions vector
  */
 static void
 moresubs(struct vars *v,
          int wanted)            /* want enough room for this one */
 {
-    struct subre **p;
+    struct subinfo *p;
     size_t        n;

     assert(wanted > 0 && (size_t) wanted >= v->nsubs);
@@ -518,13 +525,13 @@ moresubs(struct vars *v,

     if (v->subs == v->sub10)
     {
-        p = (struct subre **) MALLOC(n * sizeof(struct subre *));
+        p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo));
         if (p != NULL)
             memcpy(VS(p), VS(v->subs),
-                   v->nsubs * sizeof(struct subre *));
+                   v->nsubs * sizeof(struct subinfo));
     }
     else
-        p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
+        p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo));
     if (p == NULL)
     {
         ERR(REG_ESPACE);
@@ -532,7 +539,7 @@ moresubs(struct vars *v,
     }
     v->subs = p;
     for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
-        *p = NULL;
+        p->left = p->right = NULL;
     assert(v->nsubs == n);
     assert((size_t) wanted < v->nsubs);
 }
@@ -982,8 +989,10 @@ parseqatom(struct vars *v,
             NOERR();
             if (cap)
             {
-                assert(v->subs[subno] == NULL);
-                v->subs[subno] = atom;
+                /* save the sub-NFA's endpoints for future backrefs to use */
+                assert(v->subs[subno].left == NULL);
+                v->subs[subno].left = s;
+                v->subs[subno].right = s2;
                 if (atom->capno == 0)
                 {
                     /* normal case: just mark the atom as capturing */
@@ -1005,7 +1014,7 @@ parseqatom(struct vars *v,
         case BACKREF:            /* the Feature From The Black Lagoon */
             INSIST(type != LACON, REG_ESUBREG);
             INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
-            INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
+            INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG);
             NOERR();
             assert(v->nextvalue > 0);
             atom = subre(v, 'b', BACKR, lp, rp);
@@ -1080,7 +1089,7 @@ parseqatom(struct vars *v,
         if (atom != NULL)
             freesubre(v, atom);
         if (atomtype == '(')
-            v->subs[subno] = NULL;
+            v->subs[subno].left = v->subs[subno].right = NULL;
         delsub(v->nfa, lp, rp);
         EMPTYARC(lp, rp);
         return;
@@ -1173,14 +1182,14 @@ parseqatom(struct vars *v,
     {
         assert(atom->begin->nouts == 1);    /* just the EMPTY */
         delsub(v->nfa, atom->begin, atom->end);
-        assert(v->subs[subno] != NULL);
+        assert(v->subs[subno].left != NULL);

         /*
          * And here's why the recursion got postponed: it must wait until the
          * skeleton is filled in, because it may hit a backref that wants to
          * copy the filled-in skeleton.
          */
-        dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
+        dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right,
                atom->begin, atom->end);
         NOERR();

diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out
index 01d50ec1e3..44da7d2019 100644
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -3468,6 +3468,14 @@ select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M'
  {" TO foo",foo,o,NULL}
 (2 rows)

+-- expectMatch    21.36 RPQ    ((.))(\2){0}    xy    x    x    x    {}
+select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
+                 test_regex
+--------------------------------------------
+ {3,REG_UBACKREF,REG_UBOUNDS,REG_UNONPOSIX}
+ {x,x,x,NULL}
+(2 rows)
+
 -- doing 22 "multicharacter collating elements"
 -- # again ugh
 -- MCCEs are not implemented in Postgres, so we skip all these tests
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 7f5bc6e418..9224fdfdd3 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -1009,6 +1009,8 @@ select * from test_regex('(.*).*', 'abc', 'N');
 select * from test_regex('(a*)*', 'bc', 'N');
 -- expectMatch    21.35 M        { TO (([a-z0-9._]+|"([^"]+|"")+")+)}    {asd TO foo}    { TO foo} foo o {}
 select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M');
+-- expectMatch    21.36 RPQ    ((.))(\2){0}    xy    x    x    x    {}
+select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');

 -- doing 22 "multicharacter collating elements"
 -- # again ugh

В списке pgsql-hackers по дате отправления:

Предыдущее
От: ilmari@ilmari.org (Dagfinn Ilmari Mannsåker)
Дата:
Сообщение: [PATCH] Add tab completion for EXECUTE after EXPLAIN
Следующее
От: Andrew Dunstan
Дата:
Сообщение: Re: very long record lines in expanded psql output