Re: OOM in spgist insert

Поиск
Список
Период
Сортировка
От Tom Lane
Тема Re: OOM in spgist insert
Дата
Msg-id 1785221.1620954078@sss.pgh.pa.us
обсуждение исходный текст
Ответ на Re: OOM in spgist insert  (Alvaro Herrera <alvherre@alvh.no-ip.org>)
Ответы Re: OOM in spgist insert  (Tom Lane <tgl@sss.pgh.pa.us>)
Re: OOM in spgist insert  (Dilip Kumar <dilipbalaut@gmail.com>)
Список pgsql-hackers
Alvaro Herrera <alvherre@alvh.no-ip.org> writes:
> On 2021-May-13, Tom Lane wrote:
>> What do people think about back-patching this?  In existing branches,
>> we've defined it to be an opclass bug if it fails to shorten the leaf
>> datum enough.  But not having any defenses against that seems like
>> not a great idea.  OTOH, the 10-cycles-to-show-progress rule could be
>> argued to be an API break.

> I think if the alternative is to throw an error, we can afford to retry
> quite a few more times than 10 in order not to have that called an API
> break.  Say, retry (MAXIMUM_ALIGNOF << 3) times or so (if you want to
> parameterize on maxalign).  It's not like this is going to be a
> performance drag where not needed .. but I think leaving back-branches
> unfixed is not great.

Hm.  Index bloat is not something to totally ignore, though, so I'm
not sure what the best cutoff is.

Anyway, here is a patch set teased apart into committable bites,
and with your other points addressed.

            regards, tom lane

diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 6200699ddd..dd2ade7bb6 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -554,7 +554,7 @@ ProcessClientWriteInterrupt(bool blocked)
         {
             /*
              * Don't mess with whereToSendOutput if ProcessInterrupts wouldn't
-             * do anything.
+             * service ProcDiePending.
              */
             if (InterruptHoldoffCount == 0 && CritSectionCount == 0)
             {
@@ -3118,6 +3118,12 @@ RecoveryConflictInterrupt(ProcSignalReason reason)
  * If an interrupt condition is pending, and it's safe to service it,
  * then clear the flag and accept the interrupt.  Called only when
  * InterruptPending is true.
+ *
+ * Note: if INTERRUPTS_CAN_BE_PROCESSED() is true, then ProcessInterrupts
+ * is guaranteed to clear the InterruptPending flag before returning.
+ * (This is not the same as guaranteeing that it's still clear when we
+ * return; another interrupt could have arrived.  But we promise that
+ * any pre-existing one will have been serviced.)
  */
 void
 ProcessInterrupts(void)
@@ -3248,7 +3254,11 @@ ProcessInterrupts(void)
     {
         /*
          * Re-arm InterruptPending so that we process the cancel request as
-         * soon as we're done reading the message.
+         * soon as we're done reading the message.  (XXX this is seriously
+         * ugly: it complicates INTERRUPTS_CAN_BE_PROCESSED(), and it means we
+         * can't use that macro directly as the initial test in this function,
+         * meaning that this code also creates opportunities for other bugs to
+         * appear.)
          */
         InterruptPending = true;
     }
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 95202d37af..b95bb956b6 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -57,6 +57,15 @@
  * allowing die interrupts: HOLD_CANCEL_INTERRUPTS() and
  * RESUME_CANCEL_INTERRUPTS().
  *
+ * Note that ProcessInterrupts() has also acquired a number of tasks that
+ * do not necessarily cause a query-cancel-or-die response.  Hence, it's
+ * possible that it will just clear InterruptPending and return.
+ *
+ * INTERRUPTS_PENDING_CONDITION() can be checked to see whether an
+ * interrupt needs to be serviced, without trying to do so immediately.
+ * Some callers are also interested in INTERRUPTS_CAN_BE_PROCESSED(),
+ * which tells whether ProcessInterrupts is sure to clear the interrupt.
+ *
  * Special mechanisms are used to let an interrupt be accepted when we are
  * waiting for a lock or when we are waiting for command input (but, of
  * course, only if the interrupt holdoff counter is zero).  See the
@@ -97,24 +106,27 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount;
 /* in tcop/postgres.c */
 extern void ProcessInterrupts(void);

+/* Test whether an interrupt is pending */
 #ifndef WIN32
+#define INTERRUPTS_PENDING_CONDITION() \
+    (unlikely(InterruptPending))
+#else
+#define INTERRUPTS_PENDING_CONDITION() \
+    (unlikely(UNBLOCKED_SIGNAL_QUEUE()) ? pgwin32_dispatch_queued_signals() : 0,  \
+     unlikely(InterruptPending))
+#endif

+/* Service interrupt if one is pending */
 #define CHECK_FOR_INTERRUPTS() \
 do { \
-    if (unlikely(InterruptPending)) \
-        ProcessInterrupts(); \
-} while(0)
-#else                            /* WIN32 */
-
-#define CHECK_FOR_INTERRUPTS() \
-do { \
-    if (unlikely(UNBLOCKED_SIGNAL_QUEUE())) \
-        pgwin32_dispatch_queued_signals(); \
-    if (unlikely(InterruptPending)) \
+    if (INTERRUPTS_PENDING_CONDITION()) \
         ProcessInterrupts(); \
 } while(0)
-#endif                            /* WIN32 */

+/* Is ProcessInterrupts() guaranteed to clear InterruptPending? */
+#define INTERRUPTS_CAN_BE_PROCESSED() \
+    (InterruptHoldoffCount == 0 && CritSectionCount == 0 && \
+     QueryCancelHoldoffCount == 0)

 #define HOLD_INTERRUPTS()  (InterruptHoldoffCount++)

diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c
index 4d380c99f0..88ae2499dd 100644
--- a/src/backend/access/spgist/spgdoinsert.c
+++ b/src/backend/access/spgist/spgdoinsert.c
@@ -1905,13 +1905,14 @@ spgSplitNodeAction(Relation index, SpGistState *state,
  * Insert one item into the index.
  *
  * Returns true on success, false if we failed to complete the insertion
- * because of conflict with a concurrent insert.  In the latter case,
- * caller should re-call spgdoinsert() with the same args.
+ * (typically because of conflict with a concurrent insert).  In the latter
+ * case, caller should re-call spgdoinsert() with the same args.
  */
 bool
 spgdoinsert(Relation index, SpGistState *state,
             ItemPointer heapPtr, Datum *datums, bool *isnulls)
 {
+    bool        result = true;
     TupleDesc    leafDescriptor = state->leafTupDesc;
     bool        isnull = isnulls[spgKeyColumn];
     int            level = 0;
@@ -2012,6 +2013,14 @@ spgdoinsert(Relation index, SpGistState *state,
     parent.offnum = InvalidOffsetNumber;
     parent.node = -1;

+    /*
+     * Before entering the loop, try to clear any pending interrupt condition.
+     * If a query cancel is pending, we might as well accept it now not later;
+     * while if a non-canceling condition is pending, servicing it here avoids
+     * having to restart the insertion and redo all the work so far.
+     */
+    CHECK_FOR_INTERRUPTS();
+
     for (;;)
     {
         bool        isNew = false;
@@ -2019,9 +2028,18 @@ spgdoinsert(Relation index, SpGistState *state,
         /*
          * Bail out if query cancel is pending.  We must have this somewhere
          * in the loop since a broken opclass could produce an infinite
-         * picksplit loop.
+         * picksplit loop.  However, because we'll be holding buffer lock(s)
+         * after the first iteration, ProcessInterrupts() wouldn't be able to
+         * throw a cancel error here.  Hence, if we see that an interrupt is
+         * pending, break out of the loop and deal with the situation below.
+         * Set result = false because we must restart the insertion if the
+         * interrupt isn't a query-cancel-or-die case.
          */
-        CHECK_FOR_INTERRUPTS();
+        if (INTERRUPTS_PENDING_CONDITION())
+        {
+            result = false;
+            break;
+        }

         if (current.blkno == InvalidBlockNumber)
         {
@@ -2140,10 +2158,14 @@ spgdoinsert(Relation index, SpGistState *state,
              * spgAddNode and spgSplitTuple cases will loop back to here to
              * complete the insertion operation.  Just in case the choose
              * function is broken and produces add or split requests
-             * repeatedly, check for query cancel.
+             * repeatedly, check for query cancel (see comments above).
              */
     process_inner_tuple:
-            CHECK_FOR_INTERRUPTS();
+            if (INTERRUPTS_PENDING_CONDITION())
+            {
+                result = false;
+                break;
+            }

             innerTuple = (SpGistInnerTuple) PageGetItem(current.page,
                                                         PageGetItemId(current.page, current.offnum));
@@ -2267,5 +2289,21 @@ spgdoinsert(Relation index, SpGistState *state,
         UnlockReleaseBuffer(parent.buffer);
     }

-    return true;
+    /*
+     * We do not support being called while some outer function is holding a
+     * buffer lock (or any other reason to postpone query cancels).  If that
+     * were the case, telling the caller to retry would create an infinite
+     * loop.
+     */
+    Assert(INTERRUPTS_CAN_BE_PROCESSED());
+
+    /*
+     * Finally, check for interrupts again.  If there was a query cancel,
+     * ProcessInterrupts() will be able to throw the error here.  If it was
+     * some other kind of interrupt that can just be cleared, return false to
+     * tell our caller to retry.
+     */
+    CHECK_FOR_INTERRUPTS();
+
+    return result;
 }
diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c
index 88ae2499dd..520983ede1 100644
--- a/src/backend/access/spgist/spgdoinsert.c
+++ b/src/backend/access/spgist/spgdoinsert.c
@@ -669,7 +669,8 @@ checkAllTheSame(spgPickSplitIn *in, spgPickSplitOut *out, bool tooBig,
  * will eventually terminate if lack of balance is the issue.  If the tuple
  * is too big, we assume that repeated picksplit operations will eventually
  * make it small enough by repeated prefix-stripping.  A broken opclass could
- * make this an infinite loop, though.
+ * make this an infinite loop, though, so spgdoinsert() checks that the
+ * leaf datums get smaller each time.
  */
 static bool
 doPickSplit(Relation index, SpGistState *state,
@@ -1918,6 +1919,8 @@ spgdoinsert(Relation index, SpGistState *state,
     int            level = 0;
     Datum        leafDatums[INDEX_MAX_KEYS];
     int            leafSize;
+    int            prevLeafSize;
+    int            numNoProgressCycles = 0;
     SPPageDesc    current,
                 parent;
     FmgrInfo   *procinfo = NULL;
@@ -1988,9 +1991,10 @@ spgdoinsert(Relation index, SpGistState *state,

     /*
      * If it isn't gonna fit, and the opclass can't reduce the datum size by
-     * suffixing, bail out now rather than getting into an endless loop.
+     * suffixing, bail out now rather than doing a lot of useless work.
      */
-    if (leafSize > SPGIST_PAGE_CAPACITY && !state->config.longValuesOK)
+    if (leafSize > SPGIST_PAGE_CAPACITY &&
+        (isnull || !state->config.longValuesOK))
         ereport(ERROR,
                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                  errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
@@ -2218,6 +2222,7 @@ spgdoinsert(Relation index, SpGistState *state,
                     /* Adjust level as per opclass request */
                     level += out.result.matchNode.levelAdd;
                     /* Replace leafDatum and recompute leafSize */
+                    prevLeafSize = leafSize;
                     if (!isnull)
                     {
                         leafDatums[spgKeyColumn] = out.result.matchNode.restDatum;
@@ -2226,19 +2231,50 @@ spgdoinsert(Relation index, SpGistState *state,
                         leafSize += sizeof(ItemIdData);
                     }

+                    /*
+                     * Check new tuple size; fail if it can't fit, unless the
+                     * opclass says it can handle the situation by suffixing.
+                     * In that case, prevent an infinite loop by checking to
+                     * see if we are actually making progress by reducing the
+                     * leafSize in each pass.  This is a bit tricky
+                     * though.  Because of alignment considerations, the total
+                     * tuple size might not decrease on every pass.  Also,
+                     * there are edge cases where the choose method might seem
+                     * to not make progress for a cycle or two.  Somewhat
+                     * arbitrarily, we allow up to 10 no-progress iterations
+                     * before failing.  (The limit should be more than
+                     * MAXALIGN, to accommodate opclasses that trim one byte
+                     * from the leaf datum per pass.)
+                     */
+                    if (leafSize > SPGIST_PAGE_CAPACITY)
+                    {
+                        bool        ok = false;
+
+                        if (state->config.longValuesOK && !isnull)
+                        {
+                            if (leafSize < prevLeafSize)
+                            {
+                                ok = true;
+                                numNoProgressCycles = 0;
+                            }
+                            else if (++numNoProgressCycles < 10)
+                                ok = true;
+                        }
+                        if (!ok)
+                            ereport(ERROR,
+                                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                     errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+                                            leafSize - sizeof(ItemIdData),
+                                            SPGIST_PAGE_CAPACITY - sizeof(ItemIdData),
+                                            RelationGetRelationName(index)),
+                                     errhint("Values larger than a buffer page cannot be indexed.")));
+                    }
+
                     /*
                      * Loop around and attempt to insert the new leafDatum at
                      * "current" (which might reference an existing child
                      * tuple, or might be invalid to force us to find a new
                      * page for the tuple).
-                     *
-                     * Note: if the opclass sets longValuesOK, we rely on the
-                     * choose function to eventually shorten the leafDatum
-                     * enough to fit on a page.  We could add a test here to
-                     * complain if the datum doesn't get visibly shorter each
-                     * time, but that could get in the way of opclasses that
-                     * "simplify" datums in a way that doesn't necessarily
-                     * lead to physical shortening on every cycle.
                      */
                     break;
                 case spgAddNode:
diff --git a/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out b/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out
index 0ac99d08fb..ac0ddcecea 100644
--- a/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out
+++ b/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out
@@ -61,6 +61,12 @@ select * from t
  binary_upgrade_set_next_toast_pg_class_oid           |  1 | binary_upgrade_set_next_toast_pg_class_oid
 (9 rows)

+-- Verify clean failure when INCLUDE'd columns result in overlength tuple
+-- The error message details are platform-dependent, so show only SQLSTATE
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+ERROR:  54000
+\set VERBOSITY default
 drop index t_f1_f2_f3_idx;
 create index on t using spgist(f1 name_ops_old) include(f2, f3);
 \d+ t_f1_f2_f3_idx
@@ -100,3 +106,7 @@ select * from t
  binary_upgrade_set_next_toast_pg_class_oid           |  1 | binary_upgrade_set_next_toast_pg_class_oid
 (9 rows)

+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+ERROR:  54000
+\set VERBOSITY default
diff --git a/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql b/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql
index 76e78ba41c..982f221a8b 100644
--- a/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql
+++ b/src/test/modules/spgist_name_ops/sql/spgist_name_ops.sql
@@ -27,6 +27,12 @@ select * from t
   where f1 > 'binary_upgrade_set_n' and f1 < 'binary_upgrade_set_p'
   order by 1;

+-- Verify clean failure when INCLUDE'd columns result in overlength tuple
+-- The error message details are platform-dependent, so show only SQLSTATE
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+\set VERBOSITY default
+
 drop index t_f1_f2_f3_idx;

 create index on t using spgist(f1 name_ops_old) include(f2, f3);
@@ -39,3 +45,7 @@ select * from t
 select * from t
   where f1 > 'binary_upgrade_set_n' and f1 < 'binary_upgrade_set_p'
   order by 1;
+
+\set VERBOSITY sqlstate
+insert into t values(repeat('xyzzy', 12), 42, repeat('xyzzy', 4000));
+\set VERBOSITY default

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Greg Nancarrow
Дата:
Сообщение: Re: Executor code - found an instance of a WHILE that should just be an IF
Следующее
От: Peter Geoghegan
Дата:
Сообщение: Re: Testing autovacuum wraparound (including failsafe)