Bloom index contrib module
author Teodor Sigaev <teodor@sigaev.ru>
Fri, 1 Apr 2016 13:42:24 +0000 (16:42 +0300)
committer Teodor Sigaev <teodor@sigaev.ru>
Fri, 1 Apr 2016 13:42:24 +0000 (16:42 +0300)
This module provides a new index access method. It is a simple Bloom filter
implemented as a PostgreSQL index. It can be beneficial for searches that
involve a large number of columns.

The module is also currently the only way to test the generic WAL interface
committed earlier.

Author: Teodor Sigaev, Alexander Korotkov
Reviewers: Aleksander Alekseev, Michael Paquier, Jim Nasby

18 files changed:
contrib/Makefile
contrib/bloom/.gitignore [new file with mode: 0644]
contrib/bloom/Makefile [new file with mode: 0644]
contrib/bloom/blcost.c [new file with mode: 0644]
contrib/bloom/blinsert.c [new file with mode: 0644]
contrib/bloom/bloom--1.0.sql [new file with mode: 0644]
contrib/bloom/bloom.control [new file with mode: 0644]
contrib/bloom/bloom.h [new file with mode: 0644]
contrib/bloom/blscan.c [new file with mode: 0644]
contrib/bloom/blutils.c [new file with mode: 0644]
contrib/bloom/blvacuum.c [new file with mode: 0644]
contrib/bloom/blvalidate.c [new file with mode: 0644]
contrib/bloom/expected/bloom.out [new file with mode: 0644]
contrib/bloom/sql/bloom.sql [new file with mode: 0644]
contrib/bloom/t/001_wal.pl [new file with mode: 0644]
doc/src/sgml/bloom.sgml [new file with mode: 0644]
doc/src/sgml/contrib.sgml
doc/src/sgml/filelist.sgml

index d12dd6379b50c7384615a39d48916052369d2cba..25263c0be9494a5ee7943190088e184e4ebcb3cd 100644 (file)
@@ -8,6 +8,7 @@ SUBDIRS = \
        adminpack   \
        auth_delay  \
        auto_explain    \
+       bloom       \
        btree_gin   \
        btree_gist  \
        chkpass     \
diff --git a/contrib/bloom/.gitignore b/contrib/bloom/.gitignore
new file mode 100644 (file)
index 0000000..5dcb3ff
--- /dev/null
@@ -0,0 +1,4 @@
+# Generated subdirectories
+/log/
+/results/
+/tmp_check/
diff --git a/contrib/bloom/Makefile b/contrib/bloom/Makefile
new file mode 100644 (file)
index 0000000..13bd397
--- /dev/null
@@ -0,0 +1,24 @@
+# contrib/bloom/Makefile
+
+MODULE_big = bloom
+OBJS = blcost.o blinsert.o blscan.o blutils.o blvacuum.o blvalidate.o $(WIN32RES)
+
+EXTENSION = bloom
+DATA = bloom--1.0.sql
+PGFILEDESC = "bloom access method - signature file based index"
+
+REGRESS = bloom
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/bloom
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+wal-check: temp-install
+   $(prove_check)
diff --git a/contrib/bloom/blcost.c b/contrib/bloom/blcost.c
new file mode 100644 (file)
index 0000000..9897898
--- /dev/null
@@ -0,0 +1,48 @@
+/*-------------------------------------------------------------------------
+ *
+ * blcost.c
+ *     Cost estimate function for bloom indexes.
+ *
+ * Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *   contrib/bloom/blcost.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "optimizer/cost.h"
+#include "utils/selfuncs.h"
+
+#include "bloom.h"
+
+/*
+ * blcostestimate
+ *     Cost estimation callback for bloom index scans.
+ *
+ * The tuple count handed to genericcostestimate() is the total number of
+ * tuples in the index; all four outputs are copied from its results.
+ */
+void
+blcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+              Cost *indexStartupCost, Cost *indexTotalCost,
+              Selectivity *indexSelectivity, double *indexCorrelation)
+{
+   GenericCosts genCosts;
+   List       *indexQuals;
+
+   /* Break the index quals down for the generic estimator */
+   indexQuals = deconstruct_indexquals(path);
+
+   /* Start from an all-zero cost structure */
+   MemSet(&genCosts, 0, sizeof(genCosts));
+
+   /* The scan has to look at every index tuple */
+   genCosts.numIndexTuples = path->indexinfo->tuples;
+
+   genericcostestimate(root, path, loop_count, indexQuals, &genCosts);
+
+   /* Pass the generic figures back to the caller */
+   *indexCorrelation = genCosts.indexCorrelation;
+   *indexSelectivity = genCosts.indexSelectivity;
+   *indexTotalCost = genCosts.indexTotalCost;
+   *indexStartupCost = genCosts.indexStartupCost;
+}
diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c
new file mode 100644 (file)
index 0000000..9e66780
--- /dev/null
@@ -0,0 +1,313 @@
+/*-------------------------------------------------------------------------
+ *
+ * blinsert.c
+ *     Bloom index build and insert functions.
+ *
+ * Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *   contrib/bloom/blinsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/generic_xlog.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+#include "bloom.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * State of bloom index build.  We accumulate one page of data here before
+ * flushing it to the buffer manager.
+ *
+ * The struct is handed to bloomBuildCallback() as the opaque callback state
+ * of the heap scan started in blbuild().  "data" is reset by initCachedPage()
+ * and copied into a newly obtained buffer by flushCachedPage().
+ */
+typedef struct
+{
+   BloomState  blstate;        /* bloom index state */
+   MemoryContext tmpCtx;       /* temporary memory context reset after
+                                * each tuple */
+   char        data[BLCKSZ];   /* cached page */
+   int64       count;          /* number of tuples in cached page */
+}  BloomBuildState;
+
+/*
+ * Write the page accumulated in BloomBuildState out to the index.
+ *
+ * A buffer is obtained from BloomNewBuffer(), the cached page image is
+ * copied into the page registered with the generic WAL machinery, and the
+ * buffer is unlocked and released once the WAL record is finished.
+ */
+static void
+flushCachedPage(Relation index, BloomBuildState *buildstate)
+{
+   GenericXLogState *xlogState;
+   Buffer      newBuffer;
+   Page        target;
+
+   newBuffer = BloomNewBuffer(index);
+
+   xlogState = GenericXLogStart(index);
+   target = GenericXLogRegister(xlogState, newBuffer, true);
+   memcpy(target, buildstate->data, BLCKSZ);
+   GenericXLogFinish(xlogState);
+
+   UnlockReleaseBuffer(newBuffer);
+}
+
+/*
+ * Reset the page cached in BloomBuildState to an empty bloom page and
+ * clear its tuple counter.
+ */
+static void
+initCachedPage(BloomBuildState *buildstate)
+{
+   Page        cached = (Page) buildstate->data;
+
+   buildstate->count = 0;
+
+   /* Wipe the old image before laying out a fresh page */
+   memset(cached, 0, BLCKSZ);
+   BloomInitPage(cached, 0);
+}
+
+/*
+ * Per-tuple callback from IndexBuildHeapScan.
+ *
+ * Forms a bloom tuple for the heap tuple and appends it to the page cached
+ * in the BloomBuildState passed as "state".  When the cached page is full it
+ * is flushed and a fresh page is started.
+ *
+ * buildstate->count is incremented for every tuple placed on the cached
+ * page, including the first tuple of a fresh page: blbuild() relies on a
+ * non-zero count to decide whether the last cached page still has to be
+ * flushed, so an uncounted tuple could be dropped from the index.
+ *
+ * All per-tuple allocations are made in buildstate->tmpCtx, which is reset
+ * before returning.  tupleIsAlive is not used.
+ */
+static void
+bloomBuildCallback(Relation index, HeapTuple htup, Datum *values,
+                  bool *isnull, bool tupleIsAlive, void *state)
+{
+   BloomBuildState *buildstate = (BloomBuildState *) state;
+   MemoryContext oldCtx;
+   BloomTuple *itup;
+
+   oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
+
+   itup = BloomFormTuple(&buildstate->blstate, &htup->t_self, values, isnull);
+
+   /* Try to add next item to cached page */
+   if (!BloomPageAddItem(&buildstate->blstate, buildstate->data, itup))
+   {
+       /* Cached page is full, flush it out and make a new one */
+       flushCachedPage(index, buildstate);
+
+       CHECK_FOR_INTERRUPTS();
+
+       initCachedPage(buildstate);
+
+       if (!BloomPageAddItem(&buildstate->blstate, buildstate->data, itup))
+       {
+           /* We shouldn't be here since we're inserting to the empty page */
+           elog(ERROR, "can not add new tuple");
+       }
+   }
+
+   /* The tuple is on the cached page now, whichever path put it there */
+   buildstate->count++;
+
+   MemoryContextSwitchTo(oldCtx);
+   MemoryContextReset(buildstate->tmpCtx);
+}
+
+/*
+ * Build a new bloom index.
+ *
+ * Writes the metapage, then scans the heap with bloomBuildCallback()
+ * collecting tuples one cached page at a time.  The returned result reports
+ * the tuple count of the heap scan for both heap and index.
+ */
+IndexBuildResult *
+blbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+   BloomBuildState buildstate;
+   IndexBuildResult *result;
+   double      reltuples;
+
+   /* Refuse to build on top of an index that already has blocks */
+   if (RelationGetNumberOfBlocks(index) != 0)
+       elog(ERROR, "index \"%s\" already contains data",
+            RelationGetRelationName(index));
+
+   /* The metapage comes first */
+   BloomInitMetapage(index);
+
+   /* Set up the build state, its scratch context and the cached page */
+   memset(&buildstate, 0, sizeof(buildstate));
+   initBloomState(&buildstate.blstate, index);
+   buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
+                                             "Bloom build temporary context",
+                                             ALLOCSET_DEFAULT_MINSIZE,
+                                             ALLOCSET_DEFAULT_INITSIZE,
+                                             ALLOCSET_DEFAULT_MAXSIZE);
+   initCachedPage(&buildstate);
+
+   /* Feed every heap tuple through the callback */
+   reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
+                                  bloomBuildCallback, (void *) &buildstate);
+
+   /*
+    * There could still be some items in the cached page.  Flush this page
+    * if needed.
+    */
+   if (buildstate.count > 0)
+       flushCachedPage(index, &buildstate);
+
+   MemoryContextDelete(buildstate.tmpCtx);
+
+   result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+   result->index_tuples = reltuples;
+   result->heap_tuples = reltuples;
+
+   return result;
+}
+
+/*
+ * Build an empty bloom index in the initialization fork.
+ *
+ * Raises an error if the index already has blocks, otherwise writes just
+ * the metapage via BloomInitMetapage().
+ *
+ * NOTE(review): neither RelationGetNumberOfBlocks() nor BloomInitMetapage()
+ * is given a fork here, so it is not visible from this function that the
+ * initialization fork is the one being checked and written -- confirm
+ * against BloomInitMetapage().
+ */
+void
+blbuildempty(Relation index)
+{
+   if (RelationGetNumberOfBlocks(index) != 0)
+       elog(ERROR, "index \"%s\" already contains data",
+            RelationGetRelationName(index));
+
+   /* Initialize the meta page */
+   BloomInitMetapage(index);
+}
+
+/*
+ * Insert new tuple to the bloom index.
+ *
+ * The bloom tuple is formed in a private memory context and placed on the
+ * first page that has room, trying in order:
+ *
+ * 1) the page at notFullPage[nStart]; the metapage is only share-locked
+ *    while that block number is read and is not modified;
+ * 2) the remaining notFullPage entries, with the metapage exclusively
+ *    locked; nStart is advanced to the page that accepted the tuple;
+ * 3) a page obtained from BloomNewBuffer(), which then becomes the only
+ *    notFullPage entry.
+ *
+ * The private context is released and the caller's context restored on
+ * every return path.  Always returns false; heapRel and checkUnique are not
+ * used.
+ */
+bool
+blinsert(Relation index, Datum *values, bool *isnull,
+        ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique)
+{
+   BloomState  blstate;
+   BloomTuple *itup;
+   MemoryContext oldCtx;
+   MemoryContext insertCtx;
+   BloomMetaPageData *metaData;
+   Buffer      buffer,
+               metaBuffer;
+   Page        page,
+               metaPage;
+   BlockNumber blkno = InvalidBlockNumber;
+   OffsetNumber nStart;
+   GenericXLogState *state;
+
+   insertCtx = AllocSetContextCreate(CurrentMemoryContext,
+                                     "Bloom insert temporary context",
+                                     ALLOCSET_DEFAULT_MINSIZE,
+                                     ALLOCSET_DEFAULT_INITSIZE,
+                                     ALLOCSET_DEFAULT_MAXSIZE);
+
+   oldCtx = MemoryContextSwitchTo(insertCtx);
+
+   initBloomState(&blstate, index);
+   itup = BloomFormTuple(&blstate, ht_ctid, values, isnull);
+
+   /*
+    * At first, try to insert new tuple to the first page in notFullPage
+    * array.  If success we don't need to modify the meta page.
+    */
+   metaBuffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO);
+   LockBuffer(metaBuffer, BUFFER_LOCK_SHARE);
+   metaData = BloomPageGetMeta(BufferGetPage(metaBuffer));
+
+   if (metaData->nEnd > metaData->nStart)
+   {
+       blkno = metaData->notFullPage[metaData->nStart];
+
+       Assert(blkno != InvalidBlockNumber);
+       LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK);
+
+       buffer = ReadBuffer(index, blkno);
+       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+       state = GenericXLogStart(index);
+       page = GenericXLogRegister(state, buffer, false);
+
+       if (BloomPageAddItem(&blstate, page, itup))
+       {
+           GenericXLogFinish(state);
+           UnlockReleaseBuffer(buffer);
+           ReleaseBuffer(metaBuffer);
+           MemoryContextSwitchTo(oldCtx);
+           MemoryContextDelete(insertCtx);
+           return false;
+       }
+       else
+       {
+           GenericXLogAbort(state);
+           UnlockReleaseBuffer(buffer);
+       }
+   }
+   else
+   {
+       /* First page in notFullPage isn't suitable */
+       LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK);
+   }
+
+   /*
+    * Try other pages in notFullPage array.  We will have to change nStart in
+    * metapage.  Thus, grab exclusive lock on metapage.
+    */
+   LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE);
+
+   state = GenericXLogStart(index);
+   metaPage = GenericXLogRegister(state, metaBuffer, false);
+   metaData = BloomPageGetMeta(metaPage);
+
+   /*
+    * Iterate over notFullPage array.  Skip page we already tried first.
+    */
+   nStart = metaData->nStart;
+   if (metaData->nEnd > nStart &&
+       blkno == metaData->notFullPage[nStart])
+       nStart++;
+
+   while (metaData->nEnd > nStart)
+   {
+       blkno = metaData->notFullPage[nStart];
+       Assert(blkno != InvalidBlockNumber);
+
+       buffer = ReadBuffer(index, blkno);
+       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+       page = GenericXLogRegister(state, buffer, false);
+
+       if (BloomPageAddItem(&blstate, page, itup))
+       {
+           metaData->nStart = nStart;
+           GenericXLogFinish(state);
+           UnlockReleaseBuffer(buffer);
+           UnlockReleaseBuffer(metaBuffer);
+           MemoryContextSwitchTo(oldCtx);
+           MemoryContextDelete(insertCtx);
+           return false;
+       }
+       else
+       {
+           GenericXLogUnregister(state, buffer);
+           UnlockReleaseBuffer(buffer);
+       }
+       nStart++;
+   }
+
+   GenericXLogAbort(state);
+
+   /*
+    * Didn't find place to insert in notFullPage array.  Allocate new page.
+    */
+   buffer = BloomNewBuffer(index);
+
+   state = GenericXLogStart(index);
+   metaPage = GenericXLogRegister(state, metaBuffer, false);
+   metaData = BloomPageGetMeta(metaPage);
+   page = GenericXLogRegister(state, buffer, true);
+   BloomInitPage(page, 0);
+
+   if (!BloomPageAddItem(&blstate, page, itup))
+   {
+       /* We shouldn't be here since we're inserting to the empty page */
+       elog(ERROR, "can not add new tuple");
+   }
+
+   metaData->nStart = 0;
+   metaData->nEnd = 1;
+   metaData->notFullPage[0] = BufferGetBlockNumber(buffer);
+
+   GenericXLogFinish(state);
+
+   UnlockReleaseBuffer(buffer);
+   UnlockReleaseBuffer(metaBuffer);
+
+   /* Leave and drop the private context, as the earlier return paths do */
+   MemoryContextSwitchTo(oldCtx);
+   MemoryContextDelete(insertCtx);
+
+   return false;
+}
diff --git a/contrib/bloom/bloom--1.0.sql b/contrib/bloom/bloom--1.0.sql
new file mode 100644 (file)
index 0000000..7fa7513
--- /dev/null
@@ -0,0 +1,19 @@
+CREATE OR REPLACE FUNCTION blhandler(internal)
+RETURNS index_am_handler
+AS 'MODULE_PATHNAME'
+LANGUAGE C;
+
+-- Access method
+CREATE ACCESS METHOD bloom TYPE INDEX HANDLER blhandler;
+
+-- Opclasses
+
+CREATE OPERATOR CLASS int4_ops
+DEFAULT FOR TYPE int4 USING bloom AS
+   OPERATOR    1   =(int4, int4),
+   FUNCTION    1   hashint4(int4);
+
+CREATE OPERATOR CLASS text_ops
+DEFAULT FOR TYPE text USING bloom AS
+   OPERATOR    1   =(text, text),
+   FUNCTION    1   hashtext(text);
diff --git a/contrib/bloom/bloom.control b/contrib/bloom/bloom.control
new file mode 100644 (file)
index 0000000..4d4124b
--- /dev/null
@@ -0,0 +1,5 @@
+# bloom extension
+comment = 'bloom access method - signature file based index'
+default_version = '1.0'
+module_pathname = '$libdir/bloom'
+relocatable = true
diff --git a/contrib/bloom/bloom.h b/contrib/bloom/bloom.h
new file mode 100644 (file)
index 0000000..50bf99b
--- /dev/null
@@ -0,0 +1,178 @@
+/*-------------------------------------------------------------------------
+ *
+ * bloom.h
+ *   Header for bloom index.
+ *
+ * Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *   contrib/bloom/bloom.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _BLOOM_H_
+#define _BLOOM_H_
+
+#include "access/amapi.h"
+#include "access/generic_xlog.h"
+#include "access/itup.h"
+#include "access/xlog.h"
+#include "nodes/relation.h"
+#include "fmgr.h"
+
+/* Support procedures numbers */
+#define BLOOM_HASH_PROC            1
+#define BLOOM_NPROC                1
+
+/* Scan strategies */
+#define BLOOM_EQUAL_STRATEGY   1
+#define BLOOM_NSTRATEGIES      1
+
+/*
+ * Opaque for bloom pages.
+ *
+ * maxoff is the number of tuples currently stored on the page (read through
+ * BloomPageGetMaxOffset); flags holds the BLOOM_* page flag bits.
+ */
+typedef struct BloomPageOpaqueData
+{
+   OffsetNumber maxoff;
+   uint16      flags;
+}  BloomPageOpaqueData;
+
+typedef BloomPageOpaqueData *BloomPageOpaque;
+
+/*
+ * Bloom page flags: bit masks tested against BloomPageOpaqueData.flags.
+ *
+ * NOTE(review): BLOOM_DELETED is spelled (2<<0), which evaluates to 2, i.e.
+ * the second bit.  (1<<1) would be the conventional spelling for a flag
+ * that follows (1<<0).
+ */
+#define BLOOM_META     (1<<0)
+#define BLOOM_DELETED  (2<<0)
+
+/*
+ * Macros for accessing bloom page structures.
+ *
+ * Tuple offsets are 1-based: BloomPageGetTuple(state, page, 1) is the first
+ * tuple in the page contents, and consecutive tuples are
+ * (state)->sizeOfBloomTuple bytes apart.
+ */
+#define BloomPageGetOpaque(page) ((BloomPageOpaque) PageGetSpecialPointer(page))
+#define BloomPageGetMaxOffset(page) (BloomPageGetOpaque(page)->maxoff)
+#define BloomPageIsMeta(page) (BloomPageGetOpaque(page)->flags & BLOOM_META)
+#define BloomPageIsDeleted(page) (BloomPageGetOpaque(page)->flags & BLOOM_DELETED)
+#define BloomPageSetDeleted(page) (BloomPageGetOpaque(page)->flags |= BLOOM_DELETED)
+#define BloomPageSetNonDeleted(page) (BloomPageGetOpaque(page)->flags &= ~BLOOM_DELETED)
+#define BloomPageGetData(page)     ((BloomTuple *)PageGetContents(page))
+#define BloomPageGetTuple(state, page, offset) \
+   ((BloomTuple *)(PageGetContents(page) \
+       + (state)->sizeOfBloomTuple * ((offset) - 1)))
+#define BloomPageGetNextTuple(state, tuple) \
+   ((BloomTuple *)((Pointer)(tuple) + (state)->sizeOfBloomTuple))
+
+/* Preserved page numbers */
+#define BLOOM_METAPAGE_BLKNO   (0)
+#define BLOOM_HEAD_BLKNO       (1)     /* first data page */
+
+/*
+ * Bloom index options.
+ *
+ * A copy is kept in the metapage (BloomMetaPageData.opts).  bloomLength and
+ * bitSize[] are read by signValue() when bits are set in a signature.
+ */
+typedef struct BloomOptions
+{
+   int32       vl_len_;        /* varlena header (do not touch directly!) */
+   int         bloomLength;    /* length of signature in uint16 */
+   int         bitSize[INDEX_MAX_KEYS];        /* signature bits per index
+                                                * key */
+}  BloomOptions;
+
+/*
+ * FreeBlockNumberArray - array of block numbers sized so that the metadata
+ * fills all the space in the metapage.
+ *
+ * The element count is what remains of BLCKSZ after the page header, the
+ * aligned opaque area, and the aligned size of the fields that precede the
+ * array in BloomMetaPageData (two uint16, one uint32 and a BloomOptions),
+ * divided by sizeof(BlockNumber).
+ */
+typedef BlockNumber FreeBlockNumberArray[
+                                        MAXALIGN_DOWN(
+       BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(BloomPageOpaqueData))
+      - MAXALIGN(sizeof(uint16) * 2 + sizeof(uint32) + sizeof(BloomOptions))
+                                                      ) / sizeof(BlockNumber)
+];
+
+/*
+ * Metadata of bloom index.
+ *
+ * blinsert() treats notFullPage[nStart] .. notFullPage[nEnd - 1] as the
+ * pages to try for a new tuple; magickNumber is compared against
+ * BLOOM_MAGICK_NUMBER in initBloomState().
+ */
+typedef struct BloomMetaPageData
+{
+   uint32      magickNumber;
+   uint16      nStart;
+   uint16      nEnd;
+   BloomOptions opts;
+   FreeBlockNumberArray notFullPage;
+}  BloomMetaPageData;
+
+/* Magic number to distinguish bloom pages from others */
+#define BLOOM_MAGICK_NUMBER (0xDBAC0DED)
+
+/* Number of blocks numbers fit in BloomMetaPageData */
+#define BloomMetaBlockN        (sizeof(FreeBlockNumberArray) / sizeof(BlockNumber))
+
+#define BloomPageGetMeta(page) ((BloomMetaPageData *) PageGetContents(page))
+
+typedef struct BloomState
+{
+   FmgrInfo    hashFn[INDEX_MAX_KEYS];
+   BloomOptions *opts;         /* stored in rd_amcache and defined at
+                                * creation time */
+   int32       nColumns;
+
+   /*
+    * sizeOfBloomTuple is specific to the index and depends on reloptions,
+    * so it is precomputed: initBloomState() sets it to BLOOMTUPLEHDRSZ plus
+    * opts->bloomLength signature words.
+    */
+   int32       sizeOfBloomTuple;
+}  BloomState;
+
+/* Bytes of a bloom page not yet taken by header, opaque area or tuples */
+#define BloomPageGetFreeSpace(state, page) \
+   (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
+       - MAXALIGN(sizeof(BloomPageOpaqueData)) \
+       - BloomPageGetMaxOffset(page) * (state)->sizeOfBloomTuple)
+
+/*
+ * Tuples are very different from all other relations.
+ *
+ * A bloom tuple is a heap pointer followed by the signature.  sign is
+ * declared with one element, but tuples are allocated and copied with
+ * (state)->sizeOfBloomTuple bytes, i.e. BLOOMTUPLEHDRSZ plus bloomLength
+ * signature words.
+ */
+typedef uint16 SignType;
+
+typedef struct BloomTuple
+{
+   ItemPointerData heapPtr;
+   SignType    sign[1];
+}  BloomTuple;
+
+#define BLOOMTUPLEHDRSZ offsetof(BloomTuple, sign)
+
+/*
+ * Opaque data structure for bloom index scan.
+ *
+ * Allocated by blrescan(); sign is NULL until blgetbitmap() computes the
+ * signature for the current scan keys.
+ */
+typedef struct BloomScanOpaqueData
+{
+   SignType   *sign;           /* Scan signature */
+   BloomState  state;
+}  BloomScanOpaqueData;
+
+typedef BloomScanOpaqueData *BloomScanOpaque;
+
+/* blutils.c */
+extern void _PG_init(void);
+extern Datum blhandler(PG_FUNCTION_ARGS);
+extern void initBloomState(BloomState * state, Relation index);
+extern void BloomInitMetapage(Relation index);
+extern void BloomInitPage(Page page, uint16 flags);
+extern Buffer BloomNewBuffer(Relation index);
+extern void signValue(BloomState * state, SignType * sign, Datum value, int attno);
+extern BloomTuple *BloomFormTuple(BloomState * state, ItemPointer iptr, Datum *values, bool *isnull);
+extern bool BloomPageAddItem(BloomState * state, Page page, BloomTuple * tuple);
+
+/* blvalidate.c */
+extern bool blvalidate(Oid opclassoid);
+
+/* index access method interface functions */
+extern bool blinsert(Relation index, Datum *values, bool *isnull,
+        ItemPointer ht_ctid, Relation heapRel,
+        IndexUniqueCheck checkUnique);
+extern IndexScanDesc blbeginscan(Relation r, int nkeys, int norderbys);
+extern int64 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
+extern void blrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+        ScanKey orderbys, int norderbys);
+extern void blendscan(IndexScanDesc scan);
+extern IndexBuildResult *blbuild(Relation heap, Relation index,
+       struct IndexInfo *indexInfo);
+extern void blbuildempty(Relation index);
+extern IndexBulkDeleteResult *blbulkdelete(IndexVacuumInfo *info,
+            IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback,
+            void *callback_state);
+extern IndexBulkDeleteResult *blvacuumcleanup(IndexVacuumInfo *info,
+               IndexBulkDeleteResult *stats);
+extern bytea *bloptions(Datum reloptions, bool validate);
+extern void blcostestimate(PlannerInfo *root, IndexPath *path,
+              double loop_count, Cost *indexStartupCost,
+              Cost *indexTotalCost, Selectivity *indexSelectivity,
+              double *indexCorrelation);
+
+#endif
diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c
new file mode 100644 (file)
index 0000000..d156e88
--- /dev/null
@@ -0,0 +1,175 @@
+/*-------------------------------------------------------------------------
+ *
+ * blscan.c
+ *     Bloom index scan functions.
+ *
+ * Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *   contrib/bloom/blscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/relscan.h"
+#include "pgstat.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+#include "bloom.h"
+
+/*
+ * Begin scan of bloom index.
+ *
+ * Only the generic scan descriptor is created here; the bloom-specific
+ * opaque state is set up by blrescan().
+ */
+IndexScanDesc
+blbeginscan(Relation r, int nkeys, int norderbys)
+{
+   return RelationGetIndexScan(r, nkeys, norderbys);
+}
+
+/*
+ * Rescan a bloom index.
+ *
+ * Creates the scan's opaque state on first use, otherwise discards the
+ * signature of the previous scan, and copies the new scan keys into the
+ * scan descriptor.  orderbys and norderbys are not used.
+ */
+void
+blrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+        ScanKey orderbys, int norderbys)
+{
+   BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+
+   if (so != NULL)
+   {
+       /* Repeated call: the old signature no longer matches the keys */
+       if (so->sign)
+           pfree(so->sign);
+   }
+   else
+   {
+       /* First call for this scan: build the opaque state */
+       so = (BloomScanOpaque) palloc(sizeof(BloomScanOpaqueData));
+       initBloomState(&so->state, scan->indexRelation);
+       scan->opaque = so;
+   }
+
+   /* The signature is computed later, from the new keys */
+   so->sign = NULL;
+
+   if (scankey != NULL && scan->numberOfKeys > 0)
+       memmove(scan->keyData, scankey,
+               scan->numberOfKeys * sizeof(ScanKeyData));
+}
+
+/*
+ * End scan of bloom index.
+ *
+ * Frees the scan signature, if one was computed.  The opaque state is only
+ * created by blrescan(), so a scan that is ended without ever having been
+ * rescanned has nothing to release.
+ */
+void
+blendscan(IndexScanDesc scan)
+{
+   BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+
+   /* No opaque state means blrescan() never ran for this scan */
+   if (so == NULL)
+       return;
+
+   if (so->sign)
+       pfree(so->sign);
+   so->sign = NULL;
+}
+
+/*
+ * Insert all matching tuples into a bitmap.
+ *
+ * On the first call after blrescan() the scan signature is built from the
+ * scan keys.  Every page from BLOOM_HEAD_BLKNO to the end of the index is
+ * then read, and each tuple whose signature contains all bits of the scan
+ * signature is added to tbm.  Returns the number of tuples added.
+ *
+ * A NULL scan key yields no matches at all.  A scan without keys gets an
+ * all-zero signature, which every index tuple satisfies.
+ */
+int64
+blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+   int64       ntids = 0;
+   BlockNumber blkno = BLOOM_HEAD_BLKNO,
+               npages;
+   int         i;
+   BufferAccessStrategy bas;
+   BloomScanOpaque so = (BloomScanOpaque) scan->opaque;
+
+   if (so->sign == NULL)
+   {
+       /* New search: have to calculate search signature */
+       ScanKey     skey = scan->keyData;
+
+       /*
+        * Allocate the signature even when there are no keys: the page loop
+        * below reads so->sign unconditionally.
+        */
+       so->sign = palloc0(sizeof(SignType) * so->state.opts->bloomLength);
+
+       for (i = 0; i < scan->numberOfKeys; i++)
+       {
+           /*
+            * Assume bloom-indexable operators to be strict, so nothing could
+            * be found for NULL key.
+            */
+           if (skey->sk_flags & SK_ISNULL)
+           {
+               pfree(so->sign);
+               so->sign = NULL;
+               return 0;
+           }
+
+           /* Add next value to the signature */
+           signValue(&so->state, so->sign, skey->sk_argument,
+                     skey->sk_attno - 1);
+
+           skey++;
+       }
+   }
+
+   /*
+    * We're going to read the whole index. This is why we use appropriate
+    * buffer access strategy.
+    */
+   bas = GetAccessStrategy(BAS_BULKREAD);
+   npages = RelationGetNumberOfBlocks(scan->indexRelation);
+
+   for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
+   {
+       Buffer      buffer;
+       Page        page;
+
+       buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM,
+                                   blkno, RBM_NORMAL, bas);
+
+       LockBuffer(buffer, BUFFER_LOCK_SHARE);
+       page = BufferGetPage(buffer);
+
+       if (!BloomPageIsDeleted(page))
+       {
+           OffsetNumber offset,
+                       maxOffset = BloomPageGetMaxOffset(page);
+
+           for (offset = 1; offset <= maxOffset; offset++)
+           {
+               BloomTuple *itup = BloomPageGetTuple(&so->state, page, offset);
+               bool        res = true;
+
+               /* Check index signature with scan signature */
+               for (i = 0; res && i < so->state.opts->bloomLength; i++)
+               {
+                   if ((itup->sign[i] & so->sign[i]) != so->sign[i])
+                       res = false;
+               }
+
+               /* Add matching tuples to bitmap */
+               if (res)
+               {
+                   tbm_add_tuples(tbm, &itup->heapPtr, 1, true);
+                   ntids++;
+               }
+           }
+       }
+
+       UnlockReleaseBuffer(buffer);
+       CHECK_FOR_INTERRUPTS();
+   }
+   FreeAccessStrategy(bas);
+
+   return ntids;
+}
diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
new file mode 100644 (file)
index 0000000..b86f51f
--- /dev/null
@@ -0,0 +1,463 @@
+/*-------------------------------------------------------------------------
+ *
+ * blutils.c
+ *     Bloom index utilities.
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1990-1993, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *   contrib/bloom/blutils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amapi.h"
+#include "access/generic_xlog.h"
+#include "catalog/index.h"
+#include "storage/lmgr.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "utils/memutils.h"
+#include "access/reloptions.h"
+#include "storage/freespace.h"
+#include "storage/indexfsm.h"
+
+#include "bloom.h"
+
+/*
+ * Signature dealing macros.
+ *
+ * A signature is an array of SignType words; bit i lives in word
+ * i / BITSIGNTYPE at position i % BITSIGNTYPE.  CLRBIT, SETBIT and GETBIT
+ * each expand their index argument twice.
+ */
+#define BITSIGNTYPE (BITS_PER_BYTE * sizeof(SignType))
+#define GETWORD(x,i) ( *( (SignType*)(x) + (int)( (i) / BITSIGNTYPE ) ) )
+#define CLRBIT(x,i)   GETWORD(x,i) &= ~( 0x01 << ( (i) % BITSIGNTYPE ) )
+#define SETBIT(x,i)   GETWORD(x,i) |=  ( 0x01 << ( (i) % BITSIGNTYPE ) )
+#define GETBIT(x,i) ( (GETWORD(x,i) >> ( (i) % BITSIGNTYPE )) & 0x01 )
+
+PG_FUNCTION_INFO_V1(blhandler);
+
+/* Kind of relation options for bloom index */
+static relopt_kind bl_relopt_kind;
+
+static int32 myRand();
+static void mySrand(uint32 seed);
+
+/*
+ * Module initialization function: registers the bloom relation options.
+ *
+ * Defines the "length" option and one "colN" option for each of the
+ * INDEX_MAX_KEYS possible index columns.
+ */
+void
+_PG_init(void)
+{
+   char        optName[16];
+   int         colNo;
+
+   bl_relopt_kind = add_reloption_kind();
+
+   add_int_reloption(bl_relopt_kind, "length",
+                     "Length of signature in uint16 type", 5, 1, 256);
+
+   /* Option names are numbered from 1: col1, col2, ... */
+   for (colNo = 1; colNo <= INDEX_MAX_KEYS; colNo++)
+   {
+       snprintf(optName, sizeof(optName), "col%d", colNo);
+       add_int_reloption(bl_relopt_kind, optName,
+                     "Number of bits for corresponding column", 2, 1, 2048);
+   }
+}
+
+/*
+ * Bloom handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ *
+ * The strategy and support procedure counts come from the BLOOM_NSTRATEGIES
+ * and BLOOM_NPROC definitions in bloom.h.  The access method offers bitmap
+ * scans only: amgettuple, ammarkpos, amrestrpos and amcanreturn are left
+ * NULL.
+ */
+Datum
+blhandler(PG_FUNCTION_ARGS)
+{
+   IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+   amroutine->amstrategies = BLOOM_NSTRATEGIES;
+   amroutine->amsupport = BLOOM_NPROC;
+   amroutine->amcanorder = false;
+   amroutine->amcanorderbyop = false;
+   amroutine->amcanbackward = false;
+   amroutine->amcanunique = false;
+   amroutine->amcanmulticol = true;
+   amroutine->amoptionalkey = true;
+   amroutine->amsearcharray = false;
+   amroutine->amsearchnulls = false;
+   amroutine->amstorage = false;
+   amroutine->amclusterable = false;
+   amroutine->ampredlocks = false;
+   amroutine->amkeytype = 0;
+
+   amroutine->aminsert = blinsert;
+   amroutine->ambeginscan = blbeginscan;
+   amroutine->amgettuple = NULL;
+   amroutine->amgetbitmap = blgetbitmap;
+   amroutine->amrescan = blrescan;
+   amroutine->amendscan = blendscan;
+   amroutine->ammarkpos = NULL;
+   amroutine->amrestrpos = NULL;
+   amroutine->ambuild = blbuild;
+   amroutine->ambuildempty = blbuildempty;
+   amroutine->ambulkdelete = blbulkdelete;
+   amroutine->amvacuumcleanup = blvacuumcleanup;
+   amroutine->amcanreturn = NULL;
+   amroutine->amcostestimate = blcostestimate;
+   amroutine->amoptions = bloptions;
+   amroutine->amvalidate = blvalidate;
+
+   PG_RETURN_POINTER(amroutine);
+}
+
+/*
+ * Fill BloomState structure for particular index.
+ *
+ * Copies the hash support function of every index column into the state.
+ * If the relation has no cached options yet, they are read from the
+ * metapage and stored in rd_amcache (allocated in rd_indexcxt); an error is
+ * raised when block BLOOM_METAPAGE_BLKNO is not a bloom metapage.  Finally
+ * the tuple size implied by the options is computed.
+ */
+void
+initBloomState(BloomState *state, Relation index)
+{
+   int         i;
+
+   state->nColumns = index->rd_att->natts;
+
+   /* Initialize hash function for each attribute */
+   for (i = 0; i < index->rd_att->natts; i++)
+   {
+       fmgr_info_copy(&(state->hashFn[i]),
+                      index_getprocinfo(index, i + 1, BLOOM_HASH_PROC),
+                      CurrentMemoryContext);
+   }
+
+   /* Initialize amcache if needed with options from metapage */
+   if (!index->rd_amcache)
+   {
+       Buffer      buffer;
+       Page        page;
+       BloomMetaPageData *meta;
+       BloomOptions *opts;
+
+       buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO);
+       LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+       page = BufferGetPage(buffer);
+
+       if (!BloomPageIsMeta(page))
+           elog(ERROR, "Relation is not a bloom index");
+       meta = BloomPageGetMeta(page);
+
+       if (meta->magickNumber != BLOOM_MAGICK_NUMBER)
+           elog(ERROR, "Relation is not a bloom index");
+
+       /*
+        * Allocate in the index's own context only once the metapage has
+        * been validated, so the error paths above leave nothing behind in
+        * that context.
+        */
+       opts = MemoryContextAlloc(index->rd_indexcxt, sizeof(BloomOptions));
+       *opts = meta->opts;
+
+       UnlockReleaseBuffer(buffer);
+
+       index->rd_amcache = (void *) opts;
+   }
+
+   state->opts = (BloomOptions *) index->rd_amcache;
+   state->sizeOfBloomTuple = BLOOMTUPLEHDRSZ +
+       sizeof(SignType) * state->opts->bloomLength;
+}
+
+/*
+ * Random generator copied from FreeBSD.  We use our own random generator
+ * here for two reasons:
+ *
+ * 1) The random numbers are used for on-disk storage, so the generator must
+ *   never change.  Relying on PostgreSQL's number generator would prevent
+ *   any future change to that generator.
+ * 2) Changing the seed of PostgreSQL's random generator would be an
+ *   undesirable side effect.
+ *
+ * "next" is the generator state: it is set by mySrand() and advanced by
+ * every call to myRand().
+ */
+static int32 next;
+
+static int32
+myRand()
+{
+   /*
+    * Compute x = (7^5 * x) mod (2^31 - 1)
+    * without overflowing 31 bits:
+    *      (2^31 - 1) = 127773 * (7^5) + 2836
+    * From "Random number generators: good ones are hard to find",
+    * Park and Miller, Communications of the ACM, vol. 31, no. 10,
+    * October 1988, p. 1195.
+    */
+   int32 hi, lo, x;
+
+   /* Must be in [1, 0x7ffffffe] range at this point. */
+   hi = next / 127773;
+   lo = next % 127773;
+   x = 16807 * lo - 2836 * hi;
+   if (x < 0)
+       x += 0x7fffffff;
+   next = x;
+   /* Transform to [0, 0x7ffffffd] range. */
+   return (x - 1);
+}
+
+/*
+ * Seed the private generator used by myRand().
+ *
+ * Defined static to match its forward declaration.
+ *
+ * NOTE(review): "next" is an int32 while seed is a uint32, so a seed above
+ * 0x7fffffff may not survive the assignment as a positive value, and the
+ * result of the modulo may then fall outside the range stated below --
+ * confirm.  The arithmetic is deliberately left untouched because the
+ * generated sequence determines signatures that are stored on disk.
+ */
+static void
+mySrand(uint32 seed)
+{
+   next = seed;
+   /* Transform to [1, 0x7ffffffe] range. */
+   next = (next % 0x7ffffffe) + 1;
+}
+
+/*
+ * Add bits of given value to the signature.
+ *
+ * attno is the zero-based column number: it indexes state->hashFn[] and
+ * state->opts->bitSize[].  opts->bitSize[attno] bit positions are drawn from
+ * the private generator and set in sign; a drawn position may repeat.
+ */
+void
+signValue(BloomState *state, SignType *sign, Datum value, int attno)
+{
+   uint32      hashVal;
+   int         nBit,
+               j;
+
+   /*
+    * init generator with "column's" number to get "hashed" seed for new
+    * value. We don't want to map the same numbers from different columns
+    * into the same bits!
+    */
+   mySrand(attno);
+
+   /*
+    * Init hash sequence to map our value into bits. the same values in
+    * different columns will be mapped into different bits because of step
+    * above
+    */
+   hashVal = DatumGetInt32(FunctionCall1(&state->hashFn[attno], value));
+   mySrand(hashVal ^ myRand());
+
+   for (j = 0; j < state->opts->bitSize[attno]; j++)
+   {
+       /*
+        * Draw the bit number into a local first: SETBIT expands its index
+        * argument more than once, so passing myRand() directly would
+        * cause multiple evaluation.
+        */
+       nBit = myRand() % (state->opts->bloomLength * BITSIGNTYPE);
+       SETBIT(sign, nBit);
+   }
+}
+
+/*
+ * Build a bloom tuple for one heap row.
+ *
+ * The result is palloc0'd with state->sizeOfBloomTuple bytes, points at the
+ * heap tuple given by iptr, and has the signature bits of every non-null
+ * column value set.
+ */
+BloomTuple *
+BloomFormTuple(BloomState *state, ItemPointer iptr, Datum *values, bool *isnull)
+{
+   BloomTuple *tuple;
+   int         col;
+
+   tuple = (BloomTuple *) palloc0(state->sizeOfBloomTuple);
+   tuple->heapPtr = *iptr;
+
+   /* Fold every non-null column into the signature; nulls add no bits */
+   for (col = 0; col < state->nColumns; col++)
+   {
+       if (!isnull[col])
+           signValue(state, tuple->sign, values[col], col);
+   }
+
+   return tuple;
+}
+
+/*
+ * Append a bloom tuple to a page.
+ *
+ * Returns true when the tuple was copied onto the page.  Returns false,
+ * leaving the page untouched, when the tuple doesn't fit the page.
+ */
+bool
+BloomPageAddItem(BloomState *state, Page page, BloomTuple *tuple)
+{
+   BloomPageOpaque pageOpaque;
+   BloomTuple *slot;
+   Pointer     newLower;
+
+   /* Give up right away if one more tuple would not fit */
+   if (state->sizeOfBloomTuple > BloomPageGetFreeSpace(state, page))
+       return false;
+
+   /* The new tuple goes into the slot just past the current last one */
+   pageOpaque = BloomPageGetOpaque(page);
+   slot = BloomPageGetTuple(state, page, pageOpaque->maxoff + 1);
+   memcpy(slot, tuple, state->sizeOfBloomTuple);
+
+   /* Count the new tuple, then move pd_lower just past it */
+   pageOpaque->maxoff += 1;
+   newLower = (Pointer) BloomPageGetTuple(state, page, pageOpaque->maxoff + 1);
+   ((PageHeader) page)->pd_lower = newLower - page;
+
+   return true;
+}
+
+/*
+ * Get a buffer for a new index page, either by reusing a page recorded in
+ * the free space map or, failing that, by extending the index file.
+ *
+ * The returned buffer is already pinned and exclusive-locked.
+ * The caller is responsible for initializing the page by calling
+ * BloomInitBuffer.
+ */
+Buffer
+BloomNewBuffer(Relation index)
+{
+   Buffer      buf;
+   BlockNumber freeBlk;
+   bool        extensionLock;
+
+   /* Keep asking the FSM until it hands out a usable page or runs dry */
+   while ((freeBlk = GetFreeIndexPage(index)) != InvalidBlockNumber)
+   {
+       buf = ReadBuffer(index, freeBlk);
+
+       /*
+        * Someone else may already have recycled this page, in which case
+        * the buffer may be locked.  Only look at it if the lock can be
+        * taken without waiting.
+        */
+       if (ConditionalLockBuffer(buf))
+       {
+           Page        candidate = BufferGetPage(buf);
+
+           /* Usable if never initialized, or if marked as deleted */
+           if (PageIsNew(candidate) || BloomPageIsDeleted(candidate))
+               return buf;
+
+           LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+       }
+
+       /* Not usable: drop the pin and try the next candidate */
+       ReleaseBuffer(buf);
+   }
+
+   /* No reusable page, so the file must be extended */
+   extensionLock = !RELATION_IS_LOCAL(index);
+   if (extensionLock)
+       LockRelationForExtension(index, ExclusiveLock);
+
+   buf = ReadBuffer(index, P_NEW);
+   LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+   if (extensionLock)
+       UnlockRelationForExtension(index, ExclusiveLock);
+
+   return buf;
+}
+
+/*
+ * Set up an empty bloom page.
+ *
+ * The page is initialized as a BLCKSZ-sized page whose special area holds a
+ * BloomPageOpaqueData; that area is zeroed and then stamped with 'flags'.
+ */
+void
+BloomInitPage(Page page, uint16 flags)
+{
+   BloomPageOpaque special;
+
+   PageInit(page, BLCKSZ, sizeof(BloomPageOpaqueData));
+
+   /* Clear the whole opaque area before recording the page flags */
+   special = BloomPageGetOpaque(page);
+   memset(special, 0, sizeof(BloomPageOpaqueData));
+   special->flags = flags;
+}
+
+/*
+ * Adjust options of bloom index.
+ *
+ * Replaces unusable settings in *opts with defaults, in place:
+ * a non-positive signature length becomes 5, and a per-column bit count
+ * that is non-positive or not smaller than the number of bits in the
+ * signature becomes 2.
+ */
+static void
+adjustBloomOptions(BloomOptions *opts)
+{
+   int             i;
+
+   /* Default length of bloom filter is 5 of 16-bit integers */
+   if (opts->bloomLength <= 0)
+       opts->bloomLength = 5;
+
+   /* Check signature length */
+   for (i = 0; i < INDEX_MAX_KEYS; i++)
+   {
+       /*
+        * Zero and negative number of bits is meaningless.  Also setting
+        * more bits than the signature has seems useless.  Replace both
+        * cases with 2 bits default.
+        *
+        * The signature holds bloomLength words of BITSIGNTYPE bits each
+        * (the same product signValue() reduces bit numbers by), so the
+        * limit has to be expressed in bits, not in bytes.
+        */
+       if (opts->bitSize[i] <= 0
+           || opts->bitSize[i] >= opts->bloomLength * BITSIGNTYPE)
+           opts->bitSize[i] = 2;
+   }
+}
+
+/*
+ * Initialize metapage for bloom index.
+ *
+ * Obtains a buffer via BloomNewBuffer(), which is asserted to be block
+ * BLOOM_METAPAGE_BLKNO, and fills it with the magic number and the adjusted
+ * index options.  The page contents are written through the generic WAL
+ * interface.
+ */
+void
+BloomInitMetapage(Relation index)
+{
+   Page        metaPage;
+   Buffer      metaBuffer;
+   BloomMetaPageData *metadata;
+   GenericXLogState *state;
+
+   /*
+    * Make a new buffer.  Since it is the first buffer of the index, it
+    * should be associated with block number 0 (BLOOM_METAPAGE_BLKNO).
+    */
+   metaBuffer = BloomNewBuffer(index);
+   Assert(BufferGetBlockNumber(metaBuffer) == BLOOM_METAPAGE_BLKNO);
+
+   /*
+    * Initialize bloom index options: when the relation carries none, start
+    * from an all-zero struct and let adjustBloomOptions() fill in defaults.
+    *
+    * NOTE(review): this stores a freshly palloc0'd struct into
+    * index->rd_options; presumably it is allocated in whatever memory
+    * context is current here -- confirm it lives as long as the relation
+    * entry that now points at it.
+    */
+   if (!index->rd_options)
+       index->rd_options = palloc0(sizeof(BloomOptions));
+   adjustBloomOptions((BloomOptions *) index->rd_options);
+
+   /* Initialize contents of meta page */
+   state = GenericXLogStart(index);
+   metaPage = GenericXLogRegister(state, metaBuffer, true);
+
+   BloomInitPage(metaPage, BLOOM_META);
+   metadata = BloomPageGetMeta(metaPage);
+   memset(metadata, 0, sizeof(BloomMetaPageData));
+   metadata->magickNumber = BLOOM_MAGICK_NUMBER;
+   metadata->opts = *((BloomOptions *) index->rd_options);
+   /* Advance pd_lower past the metadata struct just written */
+   ((PageHeader) metaPage)->pd_lower += sizeof(BloomMetaPageData);
+
+   GenericXLogFinish(state);
+   UnlockReleaseBuffer(metaBuffer);
+}
+
+/*
+ * Parse the reloptions of a bloom index.
+ *
+ * Builds the parse table for the "length" option and the per-column
+ * "col1", "col2", ... options, fills a BloomOptions struct from the given
+ * reloptions and runs it through adjustBloomOptions() before returning it.
+ */
+bytea *
+bloptions(Datum reloptions, bool validate)
+{
+   relopt_parse_elt tab[INDEX_MAX_KEYS + 1];
+   relopt_value *parsed;
+   BloomOptions *result;
+   int         nparsed;
+   int         col;
+   char        name[16];
+
+   /* Slot 0: length of the signature */
+   tab[0].optname = "length";
+   tab[0].opttype = RELOPT_TYPE_INT;
+   tab[0].offset = offsetof(BloomOptions, bloomLength);
+
+   /* Slots 1..INDEX_MAX_KEYS: number of bits per column, col1, col2, ... */
+   for (col = 1; col <= INDEX_MAX_KEYS; col++)
+   {
+       relopt_parse_elt *elt = &tab[col];
+
+       snprintf(name, sizeof(name), "col%d", col);
+       elt->optname = pstrdup(name);
+       elt->opttype = RELOPT_TYPE_INT;
+       elt->offset = offsetof(BloomOptions, bitSize[col - 1]);
+   }
+
+   parsed = parseRelOptions(reloptions, validate, bl_relopt_kind, &nparsed);
+   result = allocateReloptStruct(sizeof(BloomOptions), parsed, nparsed);
+   fillRelOptions((void *) result, sizeof(BloomOptions), parsed, nparsed,
+                  validate, tab, INDEX_MAX_KEYS + 1);
+
+   /* Replace unusable values with the defaults */
+   adjustBloomOptions(result);
+
+   return (bytea *) result;
+}
diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c
new file mode 100644 (file)
index 0000000..fb8d9b8
--- /dev/null
@@ -0,0 +1,212 @@
+/*-------------------------------------------------------------------------
+ *
+ * blvacuum.c
+ *     Bloom VACUUM functions.
+ *
+ * Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *   contrib/bloom/blvacuum.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "catalog/storage.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "postmaster/autovacuum.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+
+#include "bloom.h"
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Every page from BLOOM_HEAD_BLKNO onwards is visited under an exclusive
+ * buffer lock.  Surviving tuples are compacted towards the start of the
+ * page, pages that become empty are flagged as deleted, and up to
+ * BloomMetaBlockN pages that still have room are remembered and finally
+ * written into the metapage's notFullPage array.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+            IndexBulkDeleteCallback callback, void *callback_state)
+{
+   Relation    index = info->index;
+   BlockNumber blkno,
+               npages;
+   FreeBlockNumberArray notFullPage;
+   int         countPage = 0;
+   BloomState  state;
+   Buffer      buffer;
+   Page        page;
+   GenericXLogState *gxlogState;
+
+   if (stats == NULL)
+       stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+   initBloomState(&state, index);
+
+   /*
+    * Iterate over the pages. We don't care about concurrently added pages,
+    * they can't contain tuples to delete.
+    */
+   npages = RelationGetNumberOfBlocks(index);
+   for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
+   {
+       BloomTuple *itup,
+                  *itupPtr,
+                  *itupEnd;
+       bool        pageChanged;
+
+       buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+                                   RBM_NORMAL, info->strategy);
+
+       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+       gxlogState = GenericXLogStart(index);
+       page = GenericXLogRegister(gxlogState, buffer, false);
+
+       if (BloomPageIsDeleted(page))
+       {
+           /*
+            * Nothing to do on an already deleted page, but discard the
+            * generic WAL state started above, exactly as the "didn't
+            * change anything" path below does.
+            */
+           GenericXLogAbort(gxlogState);
+           UnlockReleaseBuffer(buffer);
+           CHECK_FOR_INTERRUPTS();
+           continue;
+       }
+
+       /*
+        * Iterate over the tuples: itup is the read position, itupPtr the
+        * write position where the next surviving tuple has to end up.
+        */
+       itup = BloomPageGetTuple(&state, page, 1);
+       itupPtr = BloomPageGetTuple(&state, page, 1);
+       itupEnd = BloomPageGetTuple(&state, page, BloomPageGetMaxOffset(page) + 1);
+       while (itup < itupEnd)
+       {
+           /* Do we have to delete this tuple? */
+           if (callback(&itup->heapPtr, callback_state))
+           {
+               stats->tuples_removed += 1;
+               BloomPageGetOpaque(page)->maxoff--;
+           }
+           else
+           {
+               if (itupPtr != itup)
+               {
+                   /*
+                    * If we already deleted something before, we have to
+                    * move this tuple backward.
+                    */
+                   memmove((Pointer) itupPtr, (Pointer) itup,
+                           state.sizeOfBloomTuple);
+               }
+               stats->num_index_tuples++;
+               itupPtr = BloomPageGetNextTuple(&state, itupPtr);
+           }
+
+           itup = BloomPageGetNextTuple(&state, itup);
+       }
+
+       Assert(itupPtr == BloomPageGetTuple(&state, page, BloomPageGetMaxOffset(page) + 1));
+
+       /* Did we delete something? */
+       pageChanged = (itupPtr != itup);
+       if (pageChanged)
+       {
+           /* Is it empty page now? */
+           if (itupPtr == BloomPageGetData(page))
+               BloomPageSetDeleted(page);
+           /* Adjust pd_lower */
+           ((PageHeader) page)->pd_lower = (Pointer) itupPtr - page;
+       }
+
+       /*
+        * Remember the page as having free space.  This is checked only
+        * after the deleted flag has been set above, so that a page which
+        * just became empty is not recorded as a place to put new tuples.
+        */
+       if (!BloomPageIsDeleted(page) &&
+           BloomPageGetFreeSpace(&state, page) > state.sizeOfBloomTuple &&
+           countPage < BloomMetaBlockN)
+           notFullPage[countPage++] = blkno;
+
+       if (pageChanged)
+       {
+           /* Finish WAL-logging */
+           GenericXLogFinish(gxlogState);
+       }
+       else
+       {
+           /* Didn't change anything: abort WAL-logging */
+           GenericXLogAbort(gxlogState);
+       }
+       UnlockReleaseBuffer(buffer);
+       CHECK_FOR_INTERRUPTS();
+   }
+
+   /* Publish the pages found to have free space in the metapage */
+   if (countPage > 0)
+   {
+       BloomMetaPageData *metaData;
+
+       buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO);
+       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+       gxlogState = GenericXLogStart(index);
+       page = GenericXLogRegister(gxlogState, buffer, false);
+
+       metaData = BloomPageGetMeta(page);
+       memcpy(metaData->notFullPage, notFullPage, sizeof(FreeBlockNumberArray));
+       metaData->nStart = 0;
+       metaData->nEnd = countPage;
+
+       GenericXLogFinish(gxlogState);
+       UnlockReleaseBuffer(buffer);
+   }
+
+   return stats;
+}
+
+/*
+ * Post-VACUUM cleanup.
+ *
+ * Does nothing for an analyze-only call.  Otherwise every page from
+ * BLOOM_HEAD_BLKNO onwards is read under a share lock: deleted pages are
+ * recorded in the free space map, and the tuples of all other pages are
+ * counted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+   Relation    index = info->index;
+   BlockNumber npages,
+               blkno;
+   BlockNumber totFreePages;
+
+   if (info->analyze_only)
+       return stats;
+
+   if (stats == NULL)
+       stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+   /*
+    * Iterate over the pages: insert deleted pages into FSM and collect
+    * statistics.
+    *
+    * The tuple count is recomputed from scratch here.  A stats struct
+    * handed over from blbulkdelete() already contains the tuples it kept,
+    * so adding on top of that would count every tuple more than once.
+    */
+   npages = RelationGetNumberOfBlocks(index);
+   totFreePages = 0;
+   stats->num_index_tuples = 0;
+   for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++)
+   {
+       Buffer      buffer;
+       Page        page;
+
+       vacuum_delay_point();
+
+       buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+                                   RBM_NORMAL, info->strategy);
+       LockBuffer(buffer, BUFFER_LOCK_SHARE);
+       page = (Page) BufferGetPage(buffer);
+
+       if (BloomPageIsDeleted(page))
+       {
+           RecordFreeIndexPage(index, blkno);
+           totFreePages++;
+       }
+       else
+       {
+           /*
+            * Only num_index_tuples is accumulated; estimated_count is a
+            * flag, not a counter, and is left as it was.
+            */
+           stats->num_index_tuples += BloomPageGetMaxOffset(page);
+       }
+
+       UnlockReleaseBuffer(buffer);
+   }
+
+   IndexFreeSpaceMapVacuum(info->index);
+   stats->pages_free = totFreePages;
+   stats->num_pages = RelationGetNumberOfBlocks(index);
+
+   return stats;
+}
diff --git a/contrib/bloom/blvalidate.c b/contrib/bloom/blvalidate.c
new file mode 100644 (file)
index 0000000..12e7c7d
--- /dev/null
@@ -0,0 +1,220 @@
+/*-------------------------------------------------------------------------
+ *
+ * blvalidate.c
+ *   Opclass validator for bloom.
+ *
+ * Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *   contrib/bloom/blvalidate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amvalidate.h"
+#include "access/htup_details.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_amproc.h"
+#include "catalog/pg_opclass.h"
+#include "catalog/pg_opfamily.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
+
+#include "bloom.h"
+
+/*
+ * Validator for a bloom opclass.
+ *
+ * Checks the support functions and operators of the opclass's operator
+ * family, and that the opclass itself provides every support function
+ * numbered 1..BLOOM_NPROC.  Each problem found is reported at INFO level.
+ *
+ * Returns true if no problem was found, false otherwise.
+ */
+bool
+blvalidate(Oid opclassoid)
+{
+   bool        result = true;
+   HeapTuple   classtup;
+   Form_pg_opclass classform;
+   Oid         opfamilyoid;
+   Oid         opcintype;
+   Oid         opckeytype;
+   char       *opclassname;
+   HeapTuple   familytup;
+   Form_pg_opfamily familyform;
+   char       *opfamilyname;
+   CatCList   *proclist,
+              *oprlist;
+   List       *grouplist;
+   OpFamilyOpFuncGroup *opclassgroup;
+   int         i;
+   ListCell   *lc;
+
+   /* Fetch opclass information */
+   classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
+   if (!HeapTupleIsValid(classtup))
+       elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
+   classform = (Form_pg_opclass) GETSTRUCT(classtup);
+
+   opfamilyoid = classform->opcfamily;
+   opcintype = classform->opcintype;
+   /* Without an explicit key type, the key type is the input type */
+   opckeytype = classform->opckeytype;
+   if (!OidIsValid(opckeytype))
+       opckeytype = opcintype;
+   opclassname = NameStr(classform->opcname);
+
+   /* Fetch opfamily information */
+   familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid));
+   if (!HeapTupleIsValid(familytup))
+       elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid);
+   familyform = (Form_pg_opfamily) GETSTRUCT(familytup);
+
+   opfamilyname = NameStr(familyform->opfname);
+
+   /* Fetch all operators and support functions of the opfamily */
+   oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
+   proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
+
+   /* Check individual support functions */
+   for (i = 0; i < proclist->n_members; i++)
+   {
+       HeapTuple   proctup = &proclist->members[i]->tuple;
+       Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
+       bool        ok;
+
+       /*
+        * All bloom support functions should be registered with matching
+        * left/right types
+        */
+       if (procform->amproclefttype != procform->amprocrighttype)
+       {
+           ereport(INFO,
+                   (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                    errmsg("bloom opfamily %s contains support procedure %s with cross-type registration",
+                           opfamilyname,
+                           format_procedure(procform->amproc))));
+           result = false;
+       }
+
+       /*
+        * We can't check signatures except within the specific opclass, since
+        * we need to know the associated opckeytype in many cases.
+        */
+       if (procform->amproclefttype != opcintype)
+           continue;
+
+       /* Check procedure numbers and function signatures */
+       switch (procform->amprocnum)
+       {
+           case BLOOM_HASH_PROC:
+               ok = check_amproc_signature(procform->amproc, INT4OID, false,
+                                           1, 1, opckeytype);
+               break;
+           default:
+               ereport(INFO,
+                       (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                        errmsg("bloom opfamily %s contains function %s with invalid support number %d",
+                               opfamilyname,
+                               format_procedure(procform->amproc),
+                               procform->amprocnum)));
+               result = false;
+               continue;       /* don't want additional message */
+       }
+
+       if (!ok)
+       {
+           /* The message must name this access method, i.e. bloom */
+           ereport(INFO,
+                   (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                    errmsg("bloom opfamily %s contains function %s with wrong signature for support number %d",
+                           opfamilyname,
+                           format_procedure(procform->amproc),
+                           procform->amprocnum)));
+           result = false;
+       }
+   }
+
+   /* Check individual operators */
+   for (i = 0; i < oprlist->n_members; i++)
+   {
+       HeapTuple   oprtup = &oprlist->members[i]->tuple;
+       Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
+
+       /* Check it's allowed strategy for bloom */
+       if (oprform->amopstrategy < 1 ||
+           oprform->amopstrategy > BLOOM_NSTRATEGIES)
+       {
+           ereport(INFO,
+                   (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                    errmsg("bloom opfamily %s contains operator %s with invalid strategy number %d",
+                           opfamilyname,
+                           format_operator(oprform->amopopr),
+                           oprform->amopstrategy)));
+           result = false;
+       }
+
+       /* bloom doesn't support ORDER BY operators */
+       if (oprform->amoppurpose != AMOP_SEARCH ||
+           OidIsValid(oprform->amopsortfamily))
+       {
+           ereport(INFO,
+                   (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                    errmsg("bloom opfamily %s contains invalid ORDER BY specification for operator %s",
+                           opfamilyname,
+                           format_operator(oprform->amopopr))));
+           result = false;
+       }
+
+       /* Check operator signature --- same for all bloom strategies */
+       if (!check_amop_signature(oprform->amopopr, BOOLOID,
+                                 oprform->amoplefttype,
+                                 oprform->amoprighttype))
+       {
+           ereport(INFO,
+                   (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                    errmsg("bloom opfamily %s contains operator %s with wrong signature",
+                           opfamilyname,
+                           format_operator(oprform->amopopr))));
+           result = false;
+       }
+   }
+
+   /* Now check for inconsistent groups of operators/functions */
+   grouplist = identify_opfamily_groups(oprlist, proclist);
+   opclassgroup = NULL;
+   foreach(lc, grouplist)
+   {
+       OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc);
+
+       /* Remember the group exactly matching the test opclass */
+       if (thisgroup->lefttype == opcintype &&
+           thisgroup->righttype == opcintype)
+           opclassgroup = thisgroup;
+
+       /*
+        * There is not a lot we can do to check the operator sets, since each
+        * bloom opclass is more or less a law unto itself, and some contain
+        * only operators that are binary-compatible with the opclass datatype
+        * (meaning that empty operator sets can be OK).  That case also means
+        * that we shouldn't insist on nonempty function sets except for the
+        * opclass's own group.
+        */
+   }
+
+   /* Check that the originally-named opclass is complete */
+   for (i = 1; i <= BLOOM_NPROC; i++)
+   {
+       /* Bit i of functionset is tested for support function number i */
+       if (opclassgroup &&
+           (opclassgroup->functionset & (((uint64) 1) << i)) != 0)
+           continue;           /* got it */
+       ereport(INFO,
+               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                errmsg("bloom opclass %s is missing support function %d",
+                       opclassname, i)));
+       result = false;
+   }
+
+   /* Release the catalog cache entries taken above */
+   ReleaseCatCacheList(proclist);
+   ReleaseCatCacheList(oprlist);
+   ReleaseSysCache(familytup);
+   ReleaseSysCache(classtup);
+
+   return result;
+}
diff --git a/contrib/bloom/expected/bloom.out b/contrib/bloom/expected/bloom.out
new file mode 100644 (file)
index 0000000..5e8269f
--- /dev/null
@@ -0,0 +1,122 @@
+CREATE EXTENSION bloom;
+CREATE TABLE tst (
+   i   int4,
+   t   text
+);
+INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;
+CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3);
+SET enable_seqscan=on;
+SET enable_bitmapscan=off;
+SET enable_indexscan=off;
+SELECT count(*) FROM tst WHERE i = 7;
+ count 
+-------
+ 10000
+(1 row)
+
+SELECT count(*) FROM tst WHERE t = '5';
+ count 
+-------
+  6264
+(1 row)
+
+SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
+ count 
+-------
+   588
+(1 row)
+
+SET enable_seqscan=off;
+SET enable_bitmapscan=on;
+SET enable_indexscan=on;
+EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7;
+                QUERY PLAN                 
+-------------------------------------------
+ Aggregate
+   ->  Bitmap Heap Scan on tst
+         Recheck Cond: (i = 7)
+         ->  Bitmap Index Scan on bloomidx
+               Index Cond: (i = 7)
+(5 rows)
+
+EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE t = '5';
+                QUERY PLAN                 
+-------------------------------------------
+ Aggregate
+   ->  Bitmap Heap Scan on tst
+         Recheck Cond: (t = '5'::text)
+         ->  Bitmap Index Scan on bloomidx
+               Index Cond: (t = '5'::text)
+(5 rows)
+
+EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
+                       QUERY PLAN                        
+---------------------------------------------------------
+ Aggregate
+   ->  Bitmap Heap Scan on tst
+         Recheck Cond: ((i = 7) AND (t = '5'::text))
+         ->  Bitmap Index Scan on bloomidx
+               Index Cond: ((i = 7) AND (t = '5'::text))
+(5 rows)
+
+SELECT count(*) FROM tst WHERE i = 7;
+ count 
+-------
+ 10000
+(1 row)
+
+SELECT count(*) FROM tst WHERE t = '5';
+ count 
+-------
+  6264
+(1 row)
+
+SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
+ count 
+-------
+   588
+(1 row)
+
+DELETE FROM tst;
+INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;
+VACUUM ANALYZE tst;
+SELECT count(*) FROM tst WHERE i = 7;
+ count 
+-------
+ 10000
+(1 row)
+
+SELECT count(*) FROM tst WHERE t = '5';
+ count 
+-------
+  6264
+(1 row)
+
+SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
+ count 
+-------
+   588
+(1 row)
+
+VACUUM FULL tst;
+SELECT count(*) FROM tst WHERE i = 7;
+ count 
+-------
+ 10000
+(1 row)
+
+SELECT count(*) FROM tst WHERE t = '5';
+ count 
+-------
+  6264
+(1 row)
+
+SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
+ count 
+-------
+   588
+(1 row)
+
+RESET enable_seqscan;
+RESET enable_bitmapscan;
+RESET enable_indexscan;
diff --git a/contrib/bloom/sql/bloom.sql b/contrib/bloom/sql/bloom.sql
new file mode 100644 (file)
index 0000000..f9d0ad4
--- /dev/null
@@ -0,0 +1,47 @@
+CREATE EXTENSION bloom;
+
+CREATE TABLE tst (
+   i   int4,
+   t   text
+);
+
+INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;
+CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3);
+
+SET enable_seqscan=on;
+SET enable_bitmapscan=off;
+SET enable_indexscan=off;
+
+SELECT count(*) FROM tst WHERE i = 7;
+SELECT count(*) FROM tst WHERE t = '5';
+SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
+
+SET enable_seqscan=off;
+SET enable_bitmapscan=on;
+SET enable_indexscan=on;
+
+EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7;
+EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE t = '5';
+EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
+
+SELECT count(*) FROM tst WHERE i = 7;
+SELECT count(*) FROM tst WHERE t = '5';
+SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
+
+DELETE FROM tst;
+INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;
+VACUUM ANALYZE tst;
+
+SELECT count(*) FROM tst WHERE i = 7;
+SELECT count(*) FROM tst WHERE t = '5';
+SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
+
+VACUUM FULL tst;
+
+SELECT count(*) FROM tst WHERE i = 7;
+SELECT count(*) FROM tst WHERE t = '5';
+SELECT count(*) FROM tst WHERE i = 7 AND t = '5';
+
+RESET enable_seqscan;
+RESET enable_bitmapscan;
+RESET enable_indexscan;
diff --git a/contrib/bloom/t/001_wal.pl b/contrib/bloom/t/001_wal.pl
new file mode 100644 (file)
index 0000000..dbb6a90
--- /dev/null
@@ -0,0 +1,75 @@
+# Test generic xlog record work for bloom index replication.
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 31;
+
+my $node_master;
+my $node_standby;
+
+# Run a few queries on both master and standby and check that their results
+# match.  The single argument is a label used in the reported test name.
+sub test_index_replay
+{
+   my ($test_name) = @_;
+
+   # Wait for standby to catch up.  The queries below only see WAL that the
+   # standby has already replayed, so wait for the replay position rather
+   # than for the position that has merely been written out.
+   my $applname = $node_standby->name;
+   my $caughtup_query =
+       "SELECT pg_current_xlog_location() <= replay_location FROM pg_stat_replication WHERE application_name = '$applname';";
+   $node_master->poll_query_until('postgres', $caughtup_query)
+     or die "Timed out while waiting for standby 1 to catch up";
+
+   # Sequential scans are disabled so that the queries go through the index
+   my $queries = qq(SET enable_seqscan=off;
+SET enable_bitmapscan=on;
+SET enable_indexscan=on;
+SELECT * FROM tst WHERE i = 0;
+SELECT * FROM tst WHERE i = 3;
+SELECT * FROM tst WHERE t = 'b';
+SELECT * FROM tst WHERE t = 'f';
+SELECT * FROM tst WHERE i = 3 AND t = 'c';
+SELECT * FROM tst WHERE i = 7 AND t = 'e';
+);
+
+   # Run test queries and compare their result
+   my $master_result = $node_master->psql("postgres", $queries);
+   my $standby_result = $node_standby->psql("postgres", $queries);
+
+   is($master_result, $standby_result, "$test_name: query result matches");
+}
+
+# Initialize master node
+$node_master = get_new_node('master');
+$node_master->init(allows_streaming => 1);
+$node_master->start;
+my $backup_name = 'my_backup';
+
+# Take backup
+$node_master->backup($backup_name);
+
+# Create streaming standby linking to master
+$node_standby = get_new_node('standby');
+$node_standby->init_from_backup($node_master, $backup_name,
+   has_streaming => 1);
+$node_standby->start;
+
+# Create some bloom index on master: the extension, a two-column table with
+# 100000 rows, and a bloom index over both columns.
+$node_master->psql("postgres", "CREATE EXTENSION bloom;");
+$node_master->psql("postgres", "CREATE TABLE tst (i int4, t text);");
+$node_master->psql("postgres", "INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;");
+$node_master->psql("postgres", "CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3);");
+
+# Test that queries give same result
+test_index_replay('initial');
+
+# Run 10 cycles of table modification. Run test queries after each modification.
+# Each cycle checks three times (after DELETE, VACUUM and INSERT), which
+# together with the initial check gives the 31 planned tests.
+for my $i (1..10)
+{
+   $node_master->psql("postgres", "DELETE FROM tst WHERE i = $i;");
+   test_index_replay("delete $i");
+   $node_master->psql("postgres", "VACUUM tst;");
+   test_index_replay("vacuum $i");
+   # Insert the next batch of 10000 rows, numbered after all earlier ones
+   my ($start, $end) = (100001 + ($i - 1) * 10000, 100000 + $i * 10000);
+   $node_master->psql("postgres", "INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series($start,$end) i;");
+   test_index_replay("insert $i");
+}
diff --git a/doc/src/sgml/bloom.sgml b/doc/src/sgml/bloom.sgml
new file mode 100644 (file)
index 0000000..c207e6d
--- /dev/null
@@ -0,0 +1,218 @@
+<!-- doc/src/sgml/bloom.sgml -->
+
+<sect1 id="bloom" xreflabel="bloom">
+ <title>bloom</title>
+
+ <indexterm zone="bloom">
+  <primary>bloom</primary>
+ </indexterm>
+
+ <para>
+  <literal>bloom</> is a contrib module which implements an index access
+  method.  It serves as an example of custom access methods and of generic
+  WAL record usage, but it is also useful in its own right.
+ </para>
+
+ <sect2>
+  <title>Introduction</title>
+
+  <para>
+   An implementation of a
+   <ulink url="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</ulink>
+   allows fast exclusion of non-candidate tuples.
+   Since a signature is a lossy representation of all indexed attributes,
+   search results must be rechecked using heap information.
+   The user can specify the signature length (in uint16 units, default is 5)
+   and the number of bits that can be set per attribute
+   (1 &lt; colN &lt; 2048).
+  </para>
+
+  <para>
+   This index is useful if a table has many attributes and queries can include
+   arbitrary combinations of them.  A traditional <literal>btree</> index is
+   faster than a bloom index, but many btree indexes would be required to
+   support all possible queries, while a single bloom index suffices.  A bloom
+   index supports only equality comparisons.  Since it is a signature file, not
+   a tree, it must always be read in full, but sequentially, so index search
+   performance is constant and does not depend on the query.
+  </para>
+ </sect2>
+
+ <sect2>
+  <title>Parameters</title>
+
+  <para>
+   <literal>bloom</> indexes accept the following parameters in the
+   <literal>WITH</> clause.
+  </para>
+
+   <variablelist>
+   <varlistentry>
+    <term><literal>length</></term>
+    <listitem>
+     <para>
+      Length of signature in uint16 type values
+     </para>
+    </listitem>
+   </varlistentry>
+   </variablelist>
+   <variablelist>
+   <varlistentry>
+    <term><literal>col1 &mdash; col16</></term>
+    <listitem>
+     <para>
+      Number of bits for corresponding column
+     </para>
+    </listitem>
+   </varlistentry>
+   </variablelist>
+ </sect2>
+
+ <sect2>
+  <title>Examples</title>
+
+  <para>
+   Example of index definition is given below.
+  </para>
+
+<programlisting>
+CREATE INDEX bloomidx ON tbloom USING bloom (i1,i2,i3)
+       WITH (length=5, col1=2, col2=2, col3=4);
+</programlisting>
+
+  <para>
+   Here, we create a bloom index with a signature length of 80 bits, with
+   attributes i1 and i2 mapped to 2 bits each and attribute i3 mapped to 4 bits.
+  </para>
+
+  <para>
+   Example of index definition and usage is given below.
+  </para>
+
+<programlisting>
+CREATE TABLE tbloom AS
+SELECT
+    random()::int as i1,
+    random()::int as i2,
+    random()::int as i3,
+    random()::int as i4,
+    random()::int as i5,
+    random()::int as i6,
+    random()::int as i7,
+    random()::int as i8,
+    random()::int as i9,
+    random()::int as i10,
+    random()::int as i11,
+    random()::int as i12,
+    random()::int as i13
+FROM
+    generate_series(1,1000);
+CREATE INDEX bloomidx ON tbloom USING
+             bloom (i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12);
+SELECT pg_relation_size('bloomidx');
+CREATE index btree_idx ON tbloom(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12);
+SELECT pg_relation_size('btree_idx');
+</programlisting>
+
+<programlisting>
+=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15;
+                                                   QUERY PLAN
+-----------------------------------------------------------------------------------------------------------------
+ Bitmap Heap Scan on tbloom  (cost=1.50..5.52 rows=1 width=52) (actual time=0.057..0.057 rows=0 loops=1)
+   Recheck Cond: ((i2 = 20) AND (i10 = 15))
+   ->  Bitmap Index Scan on bloomidx  (cost=0.00..1.50 rows=1 width=0) (actual time=0.041..0.041 rows=9 loops=1)
+         Index Cond: ((i2 = 20) AND (i10 = 15))
+ Total runtime: 0.081 ms
+(5 rows)
+</programlisting>
+
+  <para>
+   For comparison, a sequential scan of the same table is slower.
+  </para>
+
+<programlisting>
+=# SET enable_bitmapscan = off;
+=# SET enable_indexscan = off;
+=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15;
+                                            QUERY PLAN
+--------------------------------------------------------------------------------------------------
+ Seq Scan on tbloom  (cost=0.00..25.00 rows=1 width=52) (actual time=0.162..0.162 rows=0 loops=1)
+   Filter: ((i2 = 20) AND (i10 = 15))
+ Total runtime: 0.181 ms
+(3 rows)
+</programlisting>
+
+ <para>
+  A btree index will not be used for this query; a sequential scan is chosen instead.
+ </para>
+
+<programlisting>
+=# DROP INDEX bloomidx;
+=# CREATE INDEX btree_idx ON tbloom(i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12);
+=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15;
+                                            QUERY PLAN
+--------------------------------------------------------------------------------------------------
+ Seq Scan on tbloom (cost=0.00..25.00 rows=1 width=52) (actual time=0.210..0.210 rows=0 loops=1)
+   Filter: ((i2 = 20) AND (i10 = 15))
+ Total runtime: 0.250 ms
+(3 rows)
+</programlisting>
+ </sect2>
+
+ <sect2>
+  <title>Opclass interface</title>
+
+  <para>
+   The bloom opclass interface is simple.  It requires one support function:
+   a hash function for the indexed datatype.  It provides one search operator:
+   the equality operator.  The example below shows an <literal>opclass</>
+   definition for the <literal>text</> datatype.
+  </para>
+
+<programlisting>
+CREATE OPERATOR CLASS text_ops
+DEFAULT FOR TYPE text USING bloom AS
+    OPERATOR    1   =(text, text),
+    FUNCTION    1   hashtext(text);
+</programlisting>
+ </sect2>
+
+ <sect2>
+  <title>Limitation</title>
+  <para>
+
+   <itemizedlist>
+    <listitem>
+     <para>
+      For now, only opclasses for <literal>int4</> and <literal>text</> come
+      with the module.  However, users may define more of them.
+     </para>
+    </listitem>
+
+    <listitem>
+     <para>
+      Only the <literal>=</literal> operator is supported for search at
+      present.  Support for arrays with contains and intersection operations
+      could be added in the future.
+     </para>
+    </listitem>
+   </itemizedlist>
+  </para>
+ </sect2>
+
+ <sect2>
+  <title>Authors</title>
+
+  <para>
+   Teodor Sigaev <email>teodor@postgrespro.ru</email>, Postgres Professional, Moscow, Russia
+  </para>
+
+  <para>
+   Alexander Korotkov <email>a.korotkov@postgrespro.ru</email>, Postgres Professional, Moscow, Russia
+  </para>
+
+  <para>
+   Oleg Bartunov <email>obartunov@postgrespro.ru</email>, Postgres Professional, Moscow, Russia
+  </para>
+ </sect2>
+
+</sect1>
index 4e3f337125185f85d5c212db55ad13e683a2de7d..c8708ecf8bbe923339f2565e462f442c0363e62d 100644 (file)
@@ -105,6 +105,7 @@ CREATE EXTENSION <replaceable>module_name</> FROM unpackaged;
  &adminpack;
  &auth-delay;
  &auto-explain;
+ &bloom;
  &btree-gin;
  &btree-gist;
  &chkpass;
index 9046f506281f0a4ea8778290312649ca1ab7b6ee..6c0ad3ffaa60fa52c33aa4b6551e981c2c35453f 100644 (file)
 <!ENTITY adminpack       SYSTEM "adminpack.sgml">
 <!ENTITY auth-delay      SYSTEM "auth-delay.sgml">
 <!ENTITY auto-explain    SYSTEM "auto-explain.sgml">
+<!ENTITY bloom           SYSTEM "bloom.sgml">
 <!ENTITY btree-gin       SYSTEM "btree-gin.sgml">
 <!ENTITY btree-gist      SYSTEM "btree-gist.sgml">
 <!ENTITY chkpass         SYSTEM "chkpass.sgml">