Re: Move PinBuffer and UnpinBuffer to atomics

Поиск
Список
Период
Сортировка
От Alexander Korotkov
Тема Re: Move PinBuffer and UnpinBuffer to atomics
Дата
Msg-id CAPpHfdtSu8nQ2yKj1tb5rKq3+w3Txtmek4Aip_z9Mc=+kGFNuA@mail.gmail.com
обсуждение исходный текст
Ответ на Re: Move PinBuffer and UnpinBuffer to atomics  (Andres Freund <andres@anarazel.de>)
Ответы Re: Move PinBuffer and UnpinBuffer to atomics  (Andres Freund <andres@anarazel.de>)
Re: Move PinBuffer and UnpinBuffer to atomics  (Amit Kapila <amit.kapila16@gmail.com>)
Список pgsql-hackers
Hi, Andres!

Please, find next revision of patch in attachment.

On Mon, Mar 28, 2016 at 4:59 PM, Andres Freund <andres@anarazel.de> wrote:
On 2016-03-28 15:46:43 +0300, Alexander Korotkov wrote:
> diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
> index 6dd7c6e..fe6fb9c 100644
> --- a/src/backend/storage/buffer/bufmgr.c
> +++ b/src/backend/storage/buffer/bufmgr.c
> @@ -52,7 +52,6 @@
>  #include "utils/resowner_private.h"
>  #include "utils/timestamp.h"
>
> -
>  /* Note: these two macros only work on shared buffers, not local ones! */
>  #define BufHdrGetBlock(bufHdr)       ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
>  #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))

spurious

Fixed.
 
> @@ -848,7 +852,7 @@ ReadBuffer_common(SMgrRelation smgr, cha
>        * it's not been recycled) but come right back here to try smgrextend
>        * again.
>        */
> -     Assert(!(bufHdr->flags & BM_VALID));            /* spinlock not needed */
> +     Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header lock not needed */
>
>       bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
>
> @@ -932,8 +936,13 @@ ReadBuffer_common(SMgrRelation smgr, cha
>
>       if (isLocalBuf)
>       {
> -             /* Only need to adjust flags */
> -             bufHdr->flags |= BM_VALID;
> +             /*
> +              * Only need to adjust flags.  Since it's local buffer, there is no
> +              * concurrency.  We assume read/write pair to be cheaper than atomic OR.
> +              */
> +             uint32 state = pg_atomic_read_u32(&bufHdr->state);
> +             state |= BM_VALID;
> +             pg_atomic_write_u32(&bufHdr->state, state);

I'm not a fan of repeating that comment in multiple places. Hm.

Moved the comments into a single place, where the macros for the CAS loop are defined (see my comments below).
 
>       }
>       else
>       {
> @@ -987,10 +996,11 @@ BufferAlloc(SMgrRelation smgr, char relp
>       BufferTag       oldTag;                 /* previous identity of selected buffer */
>       uint32          oldHash;                /* hash value for oldTag */
>       LWLock     *oldPartitionLock;           /* buffer partition lock for it */
> -     BufFlags        oldFlags;
> +     uint32          oldFlags;
>       int                     buf_id;
>       BufferDesc *buf;
>       bool            valid;
> +     uint32          state;
>
>       /* create a tag so we can lookup the buffer */
>       INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
> @@ -1050,23 +1060,23 @@ BufferAlloc(SMgrRelation smgr, char relp
>       for (;;)
>       {
>               /*
> -              * Ensure, while the spinlock's not yet held, that there's a free
> +              * Ensure, while the header lock isn't yet held, that there's a free
>                * refcount entry.
>                */
>               ReservePrivateRefCountEntry();
>
>               /*
>                * Select a victim buffer.  The buffer is returned with its header
> -              * spinlock still held!
> +              * lock still held!
>                */
> -             buf = StrategyGetBuffer(strategy);
> +             buf = StrategyGetBuffer(strategy, &state);

The new thing really still is a spinlock, not sure if it's worth
changing the comments that way.


> @@ -1319,7 +1330,7 @@ BufferAlloc(SMgrRelation smgr, char relp
>   * InvalidateBuffer -- mark a shared buffer invalid and return it to the
>   * freelist.
>   *
> - * The buffer header spinlock must be held at entry.  We drop it before
> + * The buffer header lock must be held at entry.  We drop it before
>   * returning.  (This is sane because the caller must have locked the
>   * buffer in order to be sure it should be dropped.)
>   *

Ok, by now I'm pretty sure that I don't want this to be changed
everywhere, just makes reviewing harder.

Fixed.
 
> @@ -1433,6 +1443,7 @@ void
>  MarkBufferDirty(Buffer buffer)
>  {
>       BufferDesc *bufHdr;
> +     uint32          state;
>
>       if (!BufferIsValid(buffer))
>               elog(ERROR, "bad buffer ID: %d", buffer);
> @@ -1449,14 +1460,14 @@ MarkBufferDirty(Buffer buffer)
>       /* unfortunately we can't check if the lock is held exclusively */
>       Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
>
> -     LockBufHdr(bufHdr);
> +     state = LockBufHdr(bufHdr);
>
> -     Assert(bufHdr->refcount > 0);
> +     Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
>
>       /*
>        * If the buffer was not dirty already, do vacuum accounting.
>        */
> -     if (!(bufHdr->flags & BM_DIRTY))
> +     if (!(state & BM_DIRTY))
>       {
>               VacuumPageDirty++;
>               pgBufferUsage.shared_blks_dirtied++;
> @@ -1464,9 +1475,9 @@ MarkBufferDirty(Buffer buffer)
>                       VacuumCostBalance += VacuumCostPageDirty;
>       }
>
> -     bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
> -
> -     UnlockBufHdr(bufHdr);
> +     state |= BM_DIRTY | BM_JUST_DIRTIED;
> +     state &= ~BM_LOCKED;
> +     pg_atomic_write_u32(&bufHdr->state, state);
>  }

Hm, this is a routine I think we should avoid taking the lock in; it's
often called quite frequently. Also doesn't look very hard.

Done.
 
>  static bool
>  PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
> @@ -1547,23 +1563,44 @@ PinBuffer(BufferDesc *buf, BufferAccessS
>
>       if (ref == NULL)
>       {
> +             /* loop of CAS operations */
> +             uint32                  state;
> +             uint32                  oldstate;
> +             SpinDelayStatus delayStatus;



>               ReservePrivateRefCountEntry();
>               ref = NewPrivateRefCountEntry(b);
>
> -             LockBufHdr(buf);
> -             buf->refcount++;
> -             if (strategy == NULL)
> -             {
> -                     if (buf->usage_count < BM_MAX_USAGE_COUNT)
> -                             buf->usage_count++;
> -             }
> -             else
> +             state = pg_atomic_read_u32(&buf->state);
> +             oldstate = state;
> +
> +             init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__);

Hm. This way we're calling this on every iteration. That doesn't seem
like a good idea. How about making delayStatus a static, and
init_spin_delay a macro which returns a {struct, member, one, two} type
literal?

Done.
 
> +             while (true)
>               {
> -                     if (buf->usage_count == 0)
> -                             buf->usage_count = 1;
> +                     /* spin-wait till lock is free */
> +                     while (state & BM_LOCKED)
> +                     {
> +                             make_spin_delay(&delayStatus);
> +                             state = pg_atomic_read_u32(&buf->state);
> +                             oldstate = state;
> +                     }

Maybe we should abstract that to pg_atomic_wait_bit_unset_u32()? It
seems quite likely we need this in other places (e.g. lwlock.c itself).

I have some doubts about such a function.  First, I can't find a place for it in lwlock.c — it doesn't use explicit spin delays yet.  Is it related to some changes you're planning for lwlock.c?
Second, I have doubts about its signature.  It would be logical for it to be "uint32 pg_atomic_wait_bit_unset_u32(pg_atomic_u32 *var, uint32 mask)".  But in that case we have to do an extra pg_atomic_read_u32 after an unsuccessful CAS, and that causes a small regression.  It is especially unpleasant that it's also a regression in comparison with master at low client counts.

But there is code duplication which would be very nice to avoid.  I ended up with a macro pair that encapsulates the CAS loop.

BEGIN_BUFSTATE_CAS_LOOP(buf);
state |= BM_LOCKED;
END_BUFSTATE_CAS_LOOP(buf);

For me, it simplifies readability a lot.
 
> +                     /* increase refcount */
> +                     state += BUF_REFCOUNT_ONE;
> +
> +                     /* increase usagecount unless already max */
> +                     if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
> +                             state += BUF_USAGECOUNT_ONE;
> +
> +                     /* try to do CAS, exit on success */

Seems like a somewhat obvious comment?

Comment removed.
 
> @@ -1603,15 +1640,22 @@ PinBuffer_Locked(BufferDesc *buf)
>  {

>       Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
>
> -     buf->refcount++;
> -     UnlockBufHdr(buf);
> +     /*
> +      * Since we assume to held buffer header lock, we can update the buffer
> +      * state in a single write operation.
> +      */
> +     state = pg_atomic_read_u32(&buf->state);
> +     state += 1;
> +     state &= ~BM_LOCKED;
> +     pg_atomic_write_u32(&buf->state, state);

The comment should probably mention that we're releasing the
spinlock. And the += 1 should be a BUF_REFCOUNT_ONE, otherwise it's hard
to understand.

Fixed.
 
> @@ -1646,30 +1690,66 @@ UnpinBuffer(BufferDesc *buf, bool fixOwn
>       ref->refcount--;
>       if (ref->refcount == 0)
>       {

> +
> +             /* Support LockBufferForCleanup() */
> +             if (state & BM_PIN_COUNT_WAITER)
> +             {
> +                     state = LockBufHdr(buf);
> +
> +                     if (state & BM_PIN_COUNT_WAITER && BUF_STATE_GET_REFCOUNT(state) == 1)
> +                     {
> +                             /* we just released the last pin other than the waiter's */
> +                             int                     wait_backend_pid = buf->wait_backend_pid;
>
> +                             state &= ~(BM_PIN_COUNT_WAITER | BM_LOCKED);
> +                             pg_atomic_write_u32(&buf->state, state);
> +                             ProcSendSignal(wait_backend_pid);
> +                     }
> +                     else
> +                             UnlockBufHdr(buf);
> +             }

I think it's quite confusing to use UnlockBufHdr and direct bit
expressions in one branch.

Thinking about it I also don't think the pg_atomic_write_u32 variant is
correct without adding a write barrier; the other side might not see the
values yet.

I think we can just redefine UnlockBufHdr() to be a
pg_atomic_write_u32() and pg_write_memory_barrier()?

Done.
 
>        * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
> -      * need not bother with the buffer header spinlock.  Even if someone else
> +      * need not bother with the buffer header lock.  Even if someone else
>        * changes the buffer header flags while we're doing this, we assume that
>        * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
>        * old value or the new value, but not random garbage.
>        */

The rest of the comment is outdated, BufFlags isn't a 2 byte value
anymore.

Fixed.
 
> @@ -3078,7 +3168,7 @@ FlushRelationBuffers(Relation rel)
>                                                 localpage,
>                                                 false);
>
> -                             bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
> +                             pg_atomic_fetch_and_u32(&bufHdr->state, ~(BM_DIRTY | BM_JUST_DIRTIED));

Hm, in other places you replaced atomics on local buffers with plain
writes.

Fixed.
 
>                       lsn = XLogSaveBufferForHint(buffer, buffer_std);
>               }
>
> -             LockBufHdr(bufHdr);
> -             Assert(bufHdr->refcount > 0);
> -             if (!(bufHdr->flags & BM_DIRTY))
> +             state = LockBufHdr(bufHdr);
> +
> +             Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
> +
> +             if (!(state & BM_DIRTY))
>               {
>                       dirtied = true;         /* Means "will be dirtied by this action" */

It's again worthwhile to try make this work without taking the lock.
 
The comment there claims:

/*
 * Set the page LSN if we wrote a backup block. We aren't supposed
 * to set this when only holding a share lock but as long as we
 * serialise it somehow we're OK. We choose to set LSN while
 * holding the buffer header lock, which causes any reader of an
 * LSN who holds only a share lock to also obtain a buffer header
 * lock before using PageGetLSN(), which is enforced in
 * BufferGetLSNAtomic().

Thus, buffer header lock is used for write serialization.  I don't think it would be correct to remove the lock here.
 
> -     buf->flags |= BM_IO_IN_PROGRESS;
> -
> -     UnlockBufHdr(buf);
> +     state |= BM_IO_IN_PROGRESS;
> +     state &= ~BM_LOCKED;
> +     pg_atomic_write_u32(&buf->state, state);

How about making UnlockBufHdr() take a new state parameter, and
internally unset BM_LOCKED?

Done.
 
>  /*
> + * Lock buffer header - set BM_LOCKED in buffer state.
> + */
> +uint32
> +LockBufHdr(volatile BufferDesc *desc)
> +{
> +     SpinDelayStatus delayStatus;
> +     uint32                  state;
> +
> +     init_spin_delay(&delayStatus, (Pointer)desc, __FILE__, __LINE__);
> +
> +     state = pg_atomic_read_u32(&desc->state);
> +
> +     for (;;)
> +     {
> +             /* wait till lock is free */
> +             while (state & BM_LOCKED)
> +             {
> +                     make_spin_delay(&delayStatus);
> +                     state = pg_atomic_read_u32(&desc->state);
> +                     /* Add exponential backoff? Should seldomly be contended tho. */

Outdated comment.

Fixed.
 
> +/*
> + * Unlock buffer header - unset BM_LOCKED in buffer state.
> + */
> +void
> +UnlockBufHdr(volatile BufferDesc *desc)
> +{
> +     Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED);
> +
> +     pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED);
> +}

As suggested above, there's likely no need to use an actual atomic op
here.

Fixed.  Actually, this function is removed now. 

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company 
Вложения

В списке pgsql-hackers по дате отправления:

Предыдущее
От: Jerry Sievers
Дата:
Сообщение: Pg-Logical output pkg; can't install 9.4 and 9.5 on same Wheezy box
Следующее
От: Tom Lane
Дата:
Сообщение: Re: Move PinBuffer and UnpinBuffer to atomics