Re: Move PinBuffer and UnpinBuffer to atomics

From: Andres Freund
Subject: Re: Move PinBuffer and UnpinBuffer to atomics
Date:
Msg-id: 20160328135937.x7l3gp5zivissdgf@awork2.anarazel.de
In response to: Re: Move PinBuffer and UnpinBuffer to atomics  (Alexander Korotkov <a.korotkov@postgrespro.ru>)
Responses: Re: Move PinBuffer and UnpinBuffer to atomics  (Alvaro Herrera <alvherre@2ndquadrant.com>)
Re: Move PinBuffer and UnpinBuffer to atomics  (Alexander Korotkov <a.korotkov@postgrespro.ru>)
List: pgsql-hackers
On 2016-03-28 15:46:43 +0300, Alexander Korotkov wrote:
> diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
> index 6dd7c6e..fe6fb9c 100644
> --- a/src/backend/storage/buffer/bufmgr.c
> +++ b/src/backend/storage/buffer/bufmgr.c
> @@ -52,7 +52,6 @@
>  #include "utils/resowner_private.h"
>  #include "utils/timestamp.h"
>  
> -
>  /* Note: these two macros only work on shared buffers, not local ones! */
>  #define BufHdrGetBlock(bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
>  #define BufferGetLSN(bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Spurious whitespace change.

> @@ -848,7 +852,7 @@ ReadBuffer_common(SMgrRelation smgr, cha
>       * it's not been recycled) but come right back here to try smgrextend
>       * again.
>       */
> -    Assert(!(bufHdr->flags & BM_VALID));        /* spinlock not needed */
> +    Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */
>  
>      bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
>  
> @@ -932,8 +936,13 @@ ReadBuffer_common(SMgrRelation smgr, cha
>  
>      if (isLocalBuf)
>      {
> -        /* Only need to adjust flags */
> -        bufHdr->flags |= BM_VALID;
> +        /*
> +         * Only need to adjust flags.  Since it's local buffer, there is no
> +         * concurrency.  We assume read/write pair to be cheaper than atomic OR.
> +         */
> +        uint32 state = pg_atomic_read_u32(&bufHdr->state);
> +        state |= BM_VALID;
> +        pg_atomic_write_u32(&bufHdr->state, state);

I'm not a fan of repeating that comment in multiple places. Hm.


>      }
>      else
>      {
> @@ -987,10 +996,11 @@ BufferAlloc(SMgrRelation smgr, char relp
>      BufferTag    oldTag;            /* previous identity of selected buffer */
>      uint32        oldHash;        /* hash value for oldTag */
>      LWLock       *oldPartitionLock;        /* buffer partition lock for it */
> -    BufFlags    oldFlags;
> +    uint32        oldFlags;
>      int            buf_id;
>      BufferDesc *buf;
>      bool        valid;
> +    uint32        state;
>  
>      /* create a tag so we can lookup the buffer */
>      INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
> @@ -1050,23 +1060,23 @@ BufferAlloc(SMgrRelation smgr, char relp
>      for (;;)
>      {
>          /*
> -         * Ensure, while the spinlock's not yet held, that there's a free
> +         * Ensure, while the header lock isn't yet held, that there's a free
>           * refcount entry.
>           */
>          ReservePrivateRefCountEntry();
>  
>          /*
>           * Select a victim buffer.  The buffer is returned with its header
> -         * spinlock still held!
> +         * lock still held!
>           */
> -        buf = StrategyGetBuffer(strategy);
> +        buf = StrategyGetBuffer(strategy, &state);

The new thing really is still a spinlock; I'm not sure it's worth
changing the comments that way.


> @@ -1319,7 +1330,7 @@ BufferAlloc(SMgrRelation smgr, char relp
>   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
>   * freelist.
>   *
> - * The buffer header spinlock must be held at entry.  We drop it before
> + * The buffer header lock must be held at entry.  We drop it before
>   * returning.  (This is sane because the caller must have locked the
>   * buffer in order to be sure it should be dropped.)
>   *

Ok, by now I'm pretty sure that I don't want this to be changed
everywhere; it just makes reviewing harder.



> @@ -1433,6 +1443,7 @@ void
>  MarkBufferDirty(Buffer buffer)
>  {
>      BufferDesc *bufHdr;
> +    uint32        state;
>  
>      if (!BufferIsValid(buffer))
>          elog(ERROR, "bad buffer ID: %d", buffer);
> @@ -1449,14 +1460,14 @@ MarkBufferDirty(Buffer buffer)
>      /* unfortunately we can't check if the lock is held exclusively */
>      Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
>  
> -    LockBufHdr(bufHdr);
> +    state = LockBufHdr(bufHdr);
>  
> -    Assert(bufHdr->refcount > 0);
> +    Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
>  
>      /*
>       * If the buffer was not dirty already, do vacuum accounting.
>       */
> -    if (!(bufHdr->flags & BM_DIRTY))
> +    if (!(state & BM_DIRTY))
>      {
>          VacuumPageDirty++;
>          pgBufferUsage.shared_blks_dirtied++;
> @@ -1464,9 +1475,9 @@ MarkBufferDirty(Buffer buffer)
>              VacuumCostBalance += VacuumCostPageDirty;
>      }
>  
> -    bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
> -
> -    UnlockBufHdr(bufHdr);
> +    state |= BM_DIRTY | BM_JUST_DIRTIED;
> +    state &= ~BM_LOCKED;
> +    pg_atomic_write_u32(&bufHdr->state, state);
>  }

Hm, this is a routine in which I think we should avoid taking the lock;
it's called quite frequently. It also doesn't look very hard to do.
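
Something along these lines ought to do it, I think. Just a sketch,
untested; it reuses the CAS-loop shape from PinBuffer() below, and a real
version would presumably use the spin-delay machinery while waiting for
BM_LOCKED to clear:

    uint32      old_state;
    uint32      new_state;

    old_state = pg_atomic_read_u32(&bufHdr->state);

    for (;;)
    {
        /* can't change the flags while somebody holds the header lock */
        while (old_state & BM_LOCKED)
            old_state = pg_atomic_read_u32(&bufHdr->state);

        new_state = old_state | BM_DIRTY | BM_JUST_DIRTIED;

        /* on failure the CAS refreshes old_state, so simply retry */
        if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_state,
                                           new_state))
            break;
    }

    /* vacuum accounting, based on the value we actually replaced */
    if (!(old_state & BM_DIRTY))
    {
        VacuumPageDirty++;
        pgBufferUsage.shared_blks_dirtied++;
        if (VacuumCostActive)
            VacuumCostBalance += VacuumCostPageDirty;
    }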

>  static bool
>  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
> @@ -1547,23 +1563,44 @@ PinBuffer(BufferDesc *buf, BufferAccessS
>  
>      if (ref == NULL)
>      {
> +        /* loop of CAS operations */
> +        uint32            state;
> +        uint32            oldstate;
> +        SpinDelayStatus    delayStatus;



>          ReservePrivateRefCountEntry();
>          ref = NewPrivateRefCountEntry(b);
>  
> -        LockBufHdr(buf);
> -        buf->refcount++;
> -        if (strategy == NULL)
> -        {
> -            if (buf->usage_count < BM_MAX_USAGE_COUNT)
> -                buf->usage_count++;
> -        }
> -        else
> +        state = pg_atomic_read_u32(&buf->state);
> +        oldstate = state;
> +
> +        init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__);

Hm. This way we're calling this on every invocation. That doesn't seem
like a good idea. How about making delayStatus a static, and
init_spin_delay a macro which returns a {struct, member, one, two} style
compound literal?
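
I.e. roughly something like this (a sketch only; the SpinDelayStatus
member names are made up for illustration):

#define init_spin_delay(ptr_, file_, line_) \
    { .spins = 0, .delays = 0, .ptr = (ptr_), .file = (file_), .line = (line_) }

    /* cheap aggregate initialization instead of a function call */
    SpinDelayStatus delayStatus = init_spin_delay((Pointer) buf, __FILE__, __LINE__);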


> +        while (true)
>          {
> -            if (buf->usage_count == 0)
> -                buf->usage_count = 1;
> +            /* spin-wait till lock is free */
> +            while (state & BM_LOCKED)
> +            {
> +                make_spin_delay(&delayStatus);
> +                state = pg_atomic_read_u32(&buf->state);
> +                oldstate = state;
> +            }

Maybe we should abstract that to pg_atomic_wait_bit_unset_u32()? It
seems quite likely we need this in other places (e.g. lwlock.c itself).
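
Something like this, perhaps (name from above, the rest is just a sketch
building on the patch's make_spin_delay()):

static inline uint32
pg_atomic_wait_bit_unset_u32(volatile pg_atomic_uint32 *ptr, uint32 bit,
                             SpinDelayStatus *delayStatus)
{
    uint32      value = pg_atomic_read_u32(ptr);

    /* spin, with delays, until the bit is observed as unset */
    while (value & bit)
    {
        make_spin_delay(delayStatus);
        value = pg_atomic_read_u32(ptr);
    }

    return value;
}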


> +            /* increase refcount */
> +            state += BUF_REFCOUNT_ONE;
> +
> +            /* increase usagecount unless already max */
> +            if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
> +                state += BUF_USAGECOUNT_ONE;
> +
> +            /* try to do CAS, exit on success */

Seems like a somewhat obvious comment?


> @@ -1603,15 +1640,22 @@ PinBuffer_Locked(BufferDesc *buf)
>  {

>      Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
>  
> -    buf->refcount++;
> -    UnlockBufHdr(buf);
> +    /*
> +     * Since we assume to held buffer header lock, we can update the buffer
> +     * state in a single write operation.
> +     */
> +    state = pg_atomic_read_u32(&buf->state);
> +    state += 1;
> +    state &= ~BM_LOCKED;
> +    pg_atomic_write_u32(&buf->state, state);

The comment should probably mention that we're releasing the
spinlock. And the += 1 should be a BUF_REFCOUNT_ONE, otherwise it's hard
to understand.
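
I.e. something like this (sketch; and modulo the write-barrier question
discussed further down):

    /*
     * Since we hold the buffer header lock, we can update the refcount and
     * release the lock with a single write of the state variable.
     */
    state = pg_atomic_read_u32(&buf->state);
    state += BUF_REFCOUNT_ONE;
    state &= ~BM_LOCKED;
    pg_atomic_write_u32(&buf->state, state);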

> @@ -1646,30 +1690,66 @@ UnpinBuffer(BufferDesc *buf, bool fixOwn
>      ref->refcount--;
>      if (ref->refcount == 0)
>      {

> +
> +        /* Support LockBufferForCleanup() */
> +        if (state & BM_PIN_COUNT_WAITER)
> +        {
> +            state = LockBufHdr(buf);
> +
> +            if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
> +            {
> +                /* we just released the last pin other than the waiter's */
> +                int            wait_backend_pid = buf->wait_backend_pid;
>  
> +                state &= ~(BM_PIN_COUNT_WAITER | BM_LOCKED);
> +                pg_atomic_write_u32(&buf->state, state);
> +                ProcSendSignal(wait_backend_pid);
> +            }
> +            else
> +                UnlockBufHdr(buf);
> +        }

I think it's quite confusing to use UnlockBufHdr() in one branch and
open-coded bit manipulation in the other.

Thinking about it, I also don't think the pg_atomic_write_u32() variant is
correct without a write barrier; the other side might not see the new
values yet.

I think we can just redefine UnlockBufHdr() to be a pg_write_barrier()
followed by a pg_atomic_write_u32()?
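
Roughly like this, assuming it also takes the new state value as suggested
further down (sketch only, not a finished definition):

/*
 * Release the buffer header lock with a plain write plus a write barrier,
 * so changes made while holding the lock are visible to others before
 * they see BM_LOCKED cleared.
 */
#define UnlockBufHdr(desc, s) \
    do { \
        pg_write_barrier(); \
        pg_atomic_write_u32(&(desc)->state, (s) & ~BM_LOCKED); \
    } while (0)

That would also let the branch quoted above use UnlockBufHdr() in both
paths instead of open-coding the bit manipulation in one of them.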



>       * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
> -     * need not bother with the buffer header spinlock.  Even if someone else
> +     * need not bother with the buffer header lock.  Even if someone else
>       * changes the buffer header flags while we're doing this, we assume that
>       * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
>       * old value or the new value, but not random garbage.
>       */

The rest of the comment is outdated; BufFlags isn't a 2-byte value
anymore.

> @@ -3078,7 +3168,7 @@ FlushRelationBuffers(Relation rel)
>                            localpage,
>                            false);
>  
> -                bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
> +                pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));

Hm, in other places you replaced atomics on local buffers with plain
writes.
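
I.e. presumably the same unlocked read/modify/write pattern the patch
already uses for local buffers in ReadBuffer_common(), something like:

    uint32      state = pg_atomic_read_u32(&bufHdr->state);

    /* local buffer, so no concurrency to worry about */
    state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
    pg_atomic_write_u32(&bufHdr->state, state);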

>              lsn = XLogSaveBufferForHint(buffer, buffer_std);
>          }
>  
> -        LockBufHdr(bufHdr);
> -        Assert(bufHdr->refcount > 0);
> -        if (!(bufHdr->flags & BM_DIRTY))
> +        state = LockBufHdr(bufHdr);
> +
> +        Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
> +
> +        if (!(state & BM_DIRTY))
>          {
>              dirtied = true;        /* Means "will be dirtied by this action" */

It's again worthwhile to try to make this work without taking the lock.


> -    buf->flags |= BM_IO_IN_PROGRESS;
> -
> -    UnlockBufHdr(buf);
> +    state |= BM_IO_IN_PROGRESS;
> +    state &= ~BM_LOCKED;
> +    pg_atomic_write_u32(&buf->state, state);

How about making UnlockBufHdr() take a new state parameter, and
internally unset BM_LOCKED?
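
The hunk quoted above would then just be (sketch):

    state |= BM_IO_IN_PROGRESS;
    UnlockBufHdr(buf, state);

and similarly for the other places that currently clear BM_LOCKED by hand.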


>  /*
> + * Lock buffer header - set BM_LOCKED in buffer state.
> + */
> +uint32
> +LockBufHdr(volatile BufferDesc *desc)
> +{
> +    SpinDelayStatus    delayStatus;
> +    uint32            state;
> +
> +    init_spin_delay(&delayStatus, (Pointer)desc, __FILE__, __LINE__);
> +
> +    state = pg_atomic_read_u32(&desc->state);
> +
> +    for (;;)
> +    {
> +        /* wait till lock is free */
> +        while (state & BM_LOCKED)
> +        {
> +            make_spin_delay(&delayStatus);
> +            state = pg_atomic_read_u32(&desc->state);
> +            /* Add exponential backoff? Should seldomly be contended tho. */

Outdated comment.


> +/*
> + * Unlock buffer header - unset BM_LOCKED in buffer state.
> + */
> +void
> +UnlockBufHdr(volatile BufferDesc *desc)
> +{
> +    Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
> +
> +    pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
> +}

As suggested above, there's likely no need to use an actual atomic op
here.

Greetings,

Andres Freund


