This is the Synchronized Scan patch.
This patch only implements the simplest Synchronized Scan features; I
removed some of the other features for simplicity of testing.
This patch:
(1) allocates shared memory to store a table of (relation oid,
blocknumber) pairs.
(2) When fetching a new page in a scan, it hashes the Oid of the
relation and stores a hint entry in the shared memory table
(2) When starting a new scan, it reads the shared memory table for the
hint and starts the scan at that location (which is, hopefully, near an
already-existing scan).
This isn't complete. I still plan to add some features and do a lot of
testing, depending on feedback. I am posting it here so that anyone can
test it, and so when I show results I can refer to this specific patch.
Regards,
Jeff Davis
Only in postgresql-8.2.0-ss/src/backend/access/heap: .heapam.c.swp
diff -cr postgresql-8.2.0/src/backend/access/heap/heapam.c postgresql-8.2.0-ss/src/backend/access/heap/heapam.c
*** postgresql-8.2.0/src/backend/access/heap/heapam.c Fri Nov 17 10:00:14 2006
--- postgresql-8.2.0-ss/src/backend/access/heap/heapam.c Sat Dec 9 11:43:26 2006
***************
*** 54,59 ****
--- 54,60 ----
#include "utils/lsyscache.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
+ #include "optimizer/cost.h"
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
***************
*** 65,70 ****
--- 66,162 ----
* ----------------------------------------------------------------
*/
+ static BlockNumber ss_get_startloc(Oid);
+ static int ss_report_loc(Oid,BlockNumber);
+ static int ss_hash_relid(Oid);
+
+ /*
+ * ss_get_startloc:
+ *
+ * This function reads the Sync Scan Hint Table to
+ * find a possible location for an already running
+ * sequential scan on this relation.
+ *
+ * By starting a sequential scan near the location
+ * of an already running scan, we improve the chance
+ * of finding pages in cache.
+ *
+ * This does some basic sanity checking, but the
+ * result is not guaranteed to be within the bounds
+ * of the relation size.
+ */
+ static BlockNumber ss_get_startloc(Oid relid)
+ {
+ char *shm;
+ ss_scan_loc_t scanloc;
+ int offset;
+ bool found;
+
+ offset = ss_hash_relid(relid)*sizeof(ss_scan_loc_t);
+ shm = (char *)ShmemInitStruct("Sync Scan Hint Table",
+ SYNC_SCAN_TABLE_SIZE*sizeof(ss_scan_loc_t),&found);
+ if(!found)
+ return 0;
+
+ scanloc = *((ss_scan_loc_t*)(shm+offset));
+
+ /*TODO:
+ * If the relid matches this relid, return the hint minus
+ * a fraction of the effective_cache_size. By starting at
+ * an earlier offset, it's likely that all of the blocks
+ * will already be cached, and the scan will quickly
+ * catch up to the head.
+ * If the relid does not match, the hint does not apply, so
+ * return 0.
+ */
+ if(scanloc.relid == relid)
+ return scanloc.loc;
+ else
+ return 0;
+ }
+
+ /*
+ * This is a simplistic function to hash
+ * the Oid of the relation for placement in
+ * the Sync Scan Hint Table
+ */
+ static int ss_hash_relid(Oid relid)
+ {
+ return relid % SYNC_SCAN_TABLE_SIZE;
+ }
+
+ /*
+ * ss_report_loc:
+ *
+ * Writes an entry in the Sync Scan Hint Table
+ * of the form (relid,blocknumber). This will
+ * overwrite any existing entry that may collide
+ * with this entry in the table.
+ *
+ * No locking is performed here. The data is
+ * considered to be untrusted when read by
+ * ss_get_startloc(), and it's callers.
+ */
+ static int ss_report_loc(Oid relid, BlockNumber loc)
+ {
+ char *shm;
+ ss_scan_loc_t scanloc;
+ int offset;
+ bool found;
+
+ scanloc.relid = relid;
+ scanloc.loc = loc;
+ offset = ss_hash_relid(relid);
+ shm = ShmemInitStruct("Sync Scan Hint Table",
+ SYNC_SCAN_TABLE_SIZE*sizeof(ss_scan_loc_t), &found);
+ if(!found)
+ return 0;
+
+ *((ss_scan_loc_t*)(shm+offset)) = scanloc;
+
+ return 0;
+ }
+
/* ----------------
* initscan - scan code common to heap_beginscan and heap_rescan
* ----------------
***************
*** 81,86 ****
--- 173,191 ----
*/
scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
+ /*
+ * It's important to check the results retrieved from
+ * the ss_get_startloc call. There are some sanity checks
+ * within the function, but we need to make sure that the
+ * hint value is not larger than the number of blocks in
+ * the relation, for instance in the case of a VACUUM FULL
+ * between the time the hint was placed and now.
+ */
+ scan->rs_start_page = ss_get_startloc(RelationGetRelid(scan->rs_rd));
+ if(scan->rs_start_page >= scan->rs_nblocks ||
+ scan->rs_start_page < 0)
+ scan->rs_start_page = 0;
+
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL;
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
***************
*** 223,229 ****
tuple->t_data = NULL;
return;
}
! page = 0; /* first page */
heapgetpage(scan, page);
lineoff = FirstOffsetNumber; /* first offnum */
scan->rs_inited = true;
--- 328,342 ----
tuple->t_data = NULL;
return;
}
! /*
! * start the scan at the location that we chose
! * in ss_get_startloc()
! */
! page = scan->rs_start_page;
! /*TODO:
! * only report if page - start > effective_cache_size*start_offset
! */
! ss_report_loc(RelationGetRelid(scan->rs_rd),page);
heapgetpage(scan, page);
lineoff = FirstOffsetNumber; /* first offnum */
scan->rs_inited = true;
***************
*** 364,378 ****
}
/*
! * if we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
*/
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
/*
! * return NULL if we've exhausted all the pages
*/
! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
--- 477,508 ----
}
/*
! * If we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
+ *
+ * For the forward scan, we need to wrap around to the beginning
+ * of the relation file if we reach the end.
*/
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+ if(backward)
+ page--;
+ else
+ page = (page + 1) % (scan->rs_nblocks);
+
+ /*TODO:
+ * only report if page - start > effective_cache_size*start_offset
+ */
+ ss_report_loc(RelationGetRelid(scan->rs_rd),page);
+
/*
! * Return NULL if we've exhausted all the pages.
! * For reverse scans, that means we've reached 0. For
! * forward scans, that means we've reached the page on
! * which we started.
*/
! if ((backward && (page == 0)) ||
! ((page%(scan->rs_nblocks)) == scan->rs_start_page))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
***************
*** 383,390 ****
return;
}
- page = backward ? (page - 1) : (page + 1);
-
heapgetpage(scan, page);
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
--- 513,518 ----
***************
*** 450,456 ****
tuple->t_data = NULL;
return;
}
! page = 0; /* first page */
heapgetpage(scan, page);
lineindex = 0;
scan->rs_inited = true;
--- 578,592 ----
tuple->t_data = NULL;
return;
}
! /*
! * start the scan at the location that we chose
! * in ss_get_startloc()
! */
! page = scan->rs_start_page;
! /*TODO:
! * only report if page - start > effective_cache_size*start_offset
! */
! ss_report_loc(RelationGetRelid(scan->rs_rd),page);
heapgetpage(scan, page);
lineindex = 0;
scan->rs_inited = true;
***************
*** 585,598 ****
}
/*
! * if we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
*/
/*
! * return NULL if we've exhausted all the pages
*/
! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
--- 721,750 ----
}
/*
! * If we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
+ *
+ * For the forward scan, we need to wrap around to the beginning
+ * of the relation file if we reach the end.
+ */
+ if(backward)
+ page--;
+ else
+ page = (page + 1) % (scan->rs_nblocks);
+
+ /*TODO:
+ * only report if page - start > effective_cache_size*start_offset
*/
+ ss_report_loc(RelationGetRelid(scan->rs_rd),page);
/*
! * Return NULL if we've exhausted all the pages.
! * For reverse scans, that means we've reached 0. For
! * forward scans, that means we've reached the page on
! * which we started.
*/
! if ((backward && (page == 0)) ||
! ((page%(scan->rs_nblocks)) == scan->rs_start_page))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
***************
*** 603,609 ****
return;
}
- page = backward ? (page - 1) : (page + 1);
heapgetpage(scan, page);
dp = (Page) BufferGetPage(scan->rs_cbuf);
--- 755,760 ----
***************
*** 616,621 ****
--- 767,783 ----
}
}
+ /*
+ * SyncScanShmemSize:
+ *
+ * Called by CreateSharedMemoryAndSemaphores()
+ * to find out how much room the Sync Scan Hint
+ * Table will need to occupy.
+ */
+ Size SyncScanShmemSize(void)
+ {
+ return SYNC_SCAN_TABLE_SIZE*sizeof(ss_scan_loc_t);
+ }
#if defined(DISABLE_COMPLEX_MACRO)
/*
diff -cr postgresql-8.2.0/src/backend/storage/ipc/ipci.c postgresql-8.2.0-ss/src/backend/storage/ipc/ipci.c
*** postgresql-8.2.0/src/backend/storage/ipc/ipci.c Sun Oct 15 15:04:07 2006
--- postgresql-8.2.0-ss/src/backend/storage/ipc/ipci.c Mon Dec 4 23:47:43 2006
***************
*** 19,24 ****
--- 19,25 ----
#include "access/nbtree.h"
#include "access/subtrans.h"
#include "access/twophase.h"
+ #include "access/heapam.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
***************
*** 110,115 ****
--- 111,117 ----
size = add_size(size, FreeSpaceShmemSize());
size = add_size(size, BgWriterShmemSize());
size = add_size(size, BTreeShmemSize());
+ size = add_size(size, SyncScanShmemSize());
#ifdef EXEC_BACKEND
size = add_size(size, ShmemBackendArraySize());
#endif
diff -cr postgresql-8.2.0/src/include/access/heapam.h postgresql-8.2.0-ss/src/include/access/heapam.h
*** postgresql-8.2.0/src/include/access/heapam.h Sun Nov 5 14:42:10 2006
--- postgresql-8.2.0-ss/src/include/access/heapam.h Sat Dec 9 11:33:35 2006
***************
*** 24,29 ****
--- 24,48 ----
#include "storage/lmgr.h"
#include "utils/rel.h"
#include "utils/tqual.h"
+ #include <sys/shm.h>
+
+ /*
+ * Size of the Sync Scan Hint Table.
+ */
+ #define SYNC_SCAN_TABLE_SIZE 1024
+
+ #define SYNC_SCAN_THRESHOLD 2
+ #define SYNC_SCAN_START_OFFSET 0
+ /*
+ * Structure of an entry in the
+ * Sync Scan Hint Table.
+ */
+ typedef struct {
+ Oid relid;
+ BlockNumber loc;
+ } ss_scan_loc_t;
+
+ extern Size SyncScanShmemSize(void);
/* ----------------
* fastgetattr
diff -cr postgresql-8.2.0/src/include/access/relscan.h postgresql-8.2.0-ss/src/include/access/relscan.h
*** postgresql-8.2.0/src/include/access/relscan.h Tue Oct 3 17:30:07 2006
--- postgresql-8.2.0-ss/src/include/access/relscan.h Wed Dec 6 22:31:07 2006
***************
*** 33,38 ****
--- 33,39 ----
bool rs_inited; /* false = scan not init'd yet */
HeapTupleData rs_ctup; /* current tuple in scan, if any */
BlockNumber rs_cblock; /* current block # in scan, if any */
+ BlockNumber rs_start_page; /* page where this scan began */
Buffer rs_cbuf; /* current buffer in scan, if any */
/* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
ItemPointerData rs_mctid; /* marked scan position, if any */