Bizarre behavior of literal-substring parsing in formatting.c

Поиск
Список
Период
Сортировка
От Tom Lane
Тема Bizarre behavior of literal-substring parsing in formatting.c
Дата
Msg-id 3626.1510949486@sss.pgh.pa.us
обсуждение исходный текст
Список pgsql-hackers
I started to look at handling multibyte data more sanely in formatting.c,
and soon noticed that the treatment of literal substrings in
parse_format() seemed overly complicated and underly correct, as well
as inadequately documented.  In particular, it can't make up its mind
whether backslash is a quoting character or not.  These cases make it
look like it is:

regression=# select to_char('100'::numeric, 'f"\ool"999');
 to_char  
----------
 fool 100
(1 row)

regression=# select to_char('100'::numeric, 'f"\\ool"999');
  to_char  
-----------
 f\ool 100
(1 row)


regression=# select to_char('100'::numeric, 'f"ool\"999');
 to_char  
----------
 fool"999
(1 row)

(Since the second " is quoted, we never escape the literal substring,
and the format-code 999 becomes just literal data.)  But try this:

regression=# select to_char('100'::numeric, 'f"ool\\"999');
  to_char  
-----------
 fool\"999
(1 row)

The first backslash has quoted the second one, and then reached out
and quoted the quote too.  That hardly seems sane.

Outside double quotes, it seems that backslash is only special when it
immediately precedes a double quote.  That's not enormously consistent
with other string parsers in our code, but it's at least self-consistent,
and given the lack of complaints it's probably best not to change that
behavior.

Accordingly I offer the attached rewrite.  Of the new regression
test cases, the only one that changes behavior from before is the
last one, which is the case I just showed.

If no objections, I'll push this shortly.  I think putting it in
HEAD is enough, given the lack of field complaints.

            regards, tom lane

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 35a845c..698daf6 100644
*** a/doc/src/sgml/func.sgml
--- b/doc/src/sgml/func.sgml
*************** SELECT regexp_match('abc01234xyz', '(?:(
*** 6196,6201 ****
--- 6196,6206 ----
         If you want to have a double quote in the output you must
         precede it with a backslash, for example <literal>'\"YYYY
         Month\"'</literal>. <!-- "" font-lock sanity :-) -->
+        Backslashes are not otherwise special outside of double-quoted
+        strings.  Within a double-quoted string, a backslash causes the
+        next character to be taken literally, whatever it is (but this
+        has no special effect unless the next character is a double quote
+        or another backslash).
        </para>
       </listitem>

diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 5afc293..cb0dbf7 100644
*** a/src/backend/utils/adt/formatting.c
--- b/src/backend/utils/adt/formatting.c
*************** static void
*** 1227,1237 ****
  parse_format(FormatNode *node, const char *str, const KeyWord *kw,
               const KeySuffix *suf, const int *index, int ver, NUMDesc *Num)
  {
-     const KeySuffix *s;
      FormatNode *n;
-     int            node_set = 0,
-                 suffix,
-                 last = 0;

  #ifdef DEBUG_TO_FROM_CHAR
      elog(DEBUG_elog_output, "to_char/number(): run parser");
--- 1227,1233 ----
*************** parse_format(FormatNode *node, const cha
*** 1241,1252 ****

      while (*str)
      {
!         suffix = 0;

          /*
           * Prefix
           */
!         if (ver == DCH_TYPE && (s = suff_search(str, suf, SUFFTYPE_PREFIX)) != NULL)
          {
              suffix |= s->id;
              if (s->len)
--- 1237,1250 ----

      while (*str)
      {
!         int            suffix = 0;
!         const KeySuffix *s;

          /*
           * Prefix
           */
!         if (ver == DCH_TYPE &&
!             (s = suff_search(str, suf, SUFFTYPE_PREFIX)) != NULL)
          {
              suffix |= s->id;
              if (s->len)
*************** parse_format(FormatNode *node, const cha
*** 1259,1266 ****
          if (*str && (n->key = index_seq_search(str, kw, index)) != NULL)
          {
              n->type = NODE_TYPE_ACTION;
!             n->suffix = 0;
!             node_set = 1;
              if (n->key->len)
                  str += n->key->len;

--- 1257,1263 ----
          if (*str && (n->key = index_seq_search(str, kw, index)) != NULL)
          {
              n->type = NODE_TYPE_ACTION;
!             n->suffix = suffix;
              if (n->key->len)
                  str += n->key->len;

*************** parse_format(FormatNode *node, const cha
*** 1273,1343 ****
              /*
               * Postfix
               */
!             if (ver == DCH_TYPE && *str && (s = suff_search(str, suf, SUFFTYPE_POSTFIX)) != NULL)
              {
!                 suffix |= s->id;
                  if (s->len)
                      str += s->len;
              }
          }
          else if (*str)
          {
              /*
!              * Special characters '\' and '"'
               */
!             if (*str == '"' && last != '\\')
              {
-                 int            x = 0;
-
                  while (*(++str))
                  {
!                     if (*str == '"' && x != '\\')
                      {
                          str++;
                          break;
                      }
!                     else if (*str == '\\' && x != '\\')
!                     {
!                         x = '\\';
!                         continue;
!                     }
                      n->type = NODE_TYPE_CHAR;
                      n->character = *str;
                      n->key = NULL;
                      n->suffix = 0;
!                     ++n;
!                     x = *str;
                  }
-                 node_set = 0;
-                 suffix = 0;
-                 last = 0;
-             }
-             else if (*str && *str == '\\' && last != '\\' && *(str + 1) == '"')
-             {
-                 last = *str;
-                 str++;
              }
!             else if (*str)
              {
                  n->type = NODE_TYPE_CHAR;
                  n->character = *str;
                  n->key = NULL;
!                 node_set = 1;
!                 last = 0;
                  str++;
              }
          }
-
-         /* end */
-         if (node_set)
-         {
-             if (n->type == NODE_TYPE_ACTION)
-                 n->suffix = suffix;
-             ++n;
-
-             n->suffix = 0;
-             node_set = 0;
-         }
      }

      n->type = NODE_TYPE_END;
--- 1270,1325 ----
              /*
               * Postfix
               */
!             if (ver == DCH_TYPE && *str &&
!                 (s = suff_search(str, suf, SUFFTYPE_POSTFIX)) != NULL)
              {
!                 n->suffix |= s->id;
                  if (s->len)
                      str += s->len;
              }
+
+             n++;
          }
          else if (*str)
          {
              /*
!              * Process double-quoted literal string, if any
               */
!             if (*str == '"')
              {
                  while (*(++str))
                  {
!                     if (*str == '"')
                      {
                          str++;
                          break;
                      }
!                     /* backslash quotes the next character, if any */
!                     if (*str == '\\' && *(str + 1))
!                         str++;
                      n->type = NODE_TYPE_CHAR;
                      n->character = *str;
                      n->key = NULL;
                      n->suffix = 0;
!                     n++;
                  }
              }
!             else
              {
+                 /*
+                  * Outside double-quoted strings, backslash is only special if
+                  * it immediately precedes a double quote.
+                  */
+                 if (*str == '\\' && *(str + 1) == '"')
+                     str++;
                  n->type = NODE_TYPE_CHAR;
                  n->character = *str;
                  n->key = NULL;
!                 n->suffix = 0;
!                 n++;
                  str++;
              }
          }
      }

      n->type = NODE_TYPE_END;
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index a96bfc0..17985e8 100644
*** a/src/test/regress/expected/numeric.out
--- b/src/test/regress/expected/numeric.out
*************** SELECT '' AS to_char_26, to_char('100'::
*** 1217,1222 ****
--- 1217,1283 ----
              | 100
  (1 row)

+ -- Check parsing of literal text in a format string
+ SELECT '' AS to_char_27, to_char('100'::numeric, 'foo999');
+  to_char_27 | to_char
+ ------------+---------
+             | foo 100
+ (1 row)
+
+ SELECT '' AS to_char_28, to_char('100'::numeric, 'f\oo999');
+  to_char_28 | to_char
+ ------------+----------
+             | f\oo 100
+ (1 row)
+
+ SELECT '' AS to_char_29, to_char('100'::numeric, 'f\\oo999');
+  to_char_29 |  to_char
+ ------------+-----------
+             | f\\oo 100
+ (1 row)
+
+ SELECT '' AS to_char_30, to_char('100'::numeric, 'f\"oo999');
+  to_char_30 | to_char
+ ------------+----------
+             | f"oo 100
+ (1 row)
+
+ SELECT '' AS to_char_31, to_char('100'::numeric, 'f\\"oo999');
+  to_char_31 |  to_char
+ ------------+-----------
+             | f\"oo 100
+ (1 row)
+
+ SELECT '' AS to_char_32, to_char('100'::numeric, 'f"ool"999');
+  to_char_32 | to_char
+ ------------+----------
+             | fool 100
+ (1 row)
+
+ SELECT '' AS to_char_33, to_char('100'::numeric, 'f"\ool"999');
+  to_char_33 | to_char
+ ------------+----------
+             | fool 100
+ (1 row)
+
+ SELECT '' AS to_char_34, to_char('100'::numeric, 'f"\\ool"999');
+  to_char_34 |  to_char
+ ------------+-----------
+             | f\ool 100
+ (1 row)
+
+ SELECT '' AS to_char_35, to_char('100'::numeric, 'f"ool\"999');
+  to_char_35 | to_char
+ ------------+----------
+             | fool"999
+ (1 row)
+
+ SELECT '' AS to_char_36, to_char('100'::numeric, 'f"ool\\"999');
+  to_char_36 |  to_char
+ ------------+-----------
+             | fool\ 100
+ (1 row)
+
  -- TO_NUMBER()
  --
  SET lc_numeric = 'C';
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index 321c7bd..d77504e 100644
*** a/src/test/regress/sql/numeric.sql
--- b/src/test/regress/sql/numeric.sql
*************** SELECT '' AS to_char_24, to_char('100'::
*** 786,791 ****
--- 786,803 ----
  SELECT '' AS to_char_25, to_char('100'::numeric, 'FM999.');
  SELECT '' AS to_char_26, to_char('100'::numeric, 'FM999');

+ -- Check parsing of literal text in a format string
+ SELECT '' AS to_char_27, to_char('100'::numeric, 'foo999');
+ SELECT '' AS to_char_28, to_char('100'::numeric, 'f\oo999');
+ SELECT '' AS to_char_29, to_char('100'::numeric, 'f\\oo999');
+ SELECT '' AS to_char_30, to_char('100'::numeric, 'f\"oo999');
+ SELECT '' AS to_char_31, to_char('100'::numeric, 'f\\"oo999');
+ SELECT '' AS to_char_32, to_char('100'::numeric, 'f"ool"999');
+ SELECT '' AS to_char_33, to_char('100'::numeric, 'f"\ool"999');
+ SELECT '' AS to_char_34, to_char('100'::numeric, 'f"\\ool"999');
+ SELECT '' AS to_char_35, to_char('100'::numeric, 'f"ool\"999');
+ SELECT '' AS to_char_36, to_char('100'::numeric, 'f"ool\\"999');
+
  -- TO_NUMBER()
  --
  SET lc_numeric = 'C';

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Robert Haas
Дата:
Сообщение: Re: bgwriter_lru_maxpages range in postgresql.conf
Следующее
От: Pavel Stehule
Дата:
Сообщение: Re: pspg - psql pager