Discussion: faster version of AllocSetFreeIndex for x86 architecture


faster version of AllocSetFreeIndex for x86 architecture

From: Atsushi Ogawa
Date:
Hi,
I made a faster version of AllocSetFreeIndex for x86 architecture.

Attached are the benchmark programs and the patch file.

 alloc_test.pl: benchmark script
 alloc_test.c: benchmark program
 aset_free_index.patch: patch for util/mmgr/aset.c

This benchmark compares the original function with the faster version.
To run the benchmark, simply execute alloc_test.pl; the script compiles
alloc_test.c and runs the benchmark.

Results of benchmark script:
Xeon(Core architecture), RedHat EL4, gcc 3.4.6
 bytes   :     4     8    16    32    64   128   256   512  1024   mix
 original: 0.780 0.780 0.820 0.870 0.930 0.970 1.030 1.080 1.130 0.950
 patched : 0.380 0.170 0.170 0.170 0.170 0.180 0.170 0.180 0.180 0.280

Core2, Windows XP, gcc 3.4.4 (cygwin)
 bytes   :     4     8    16    32    64   128   256   512  1024   mix
 original: 0.249 0.249 0.515 0.452 0.577 0.671 0.796 0.890 0.999 1.577
 patched : 0.358 0.218 0.202 0.218 0.218 0.218 0.202 0.218 0.218 0.218

Xeon(Pentium4 architecture), RedHat EL4, gcc 3.4.6
 bytes   :     4     8    16    32    64   128   256   512  1024   mix
 original: 0.510 0.520 0.620 0.860 0.970 1.260 1.150 1.220 1.290 0.860
 patched : 0.620 0.530 0.530 0.540 0.540 0.530 0.540 0.530 0.530 0.490

The effect of the patch, as measured by oprofile, is:
- test program: pgbench -c 1 -t 50000 (fsync=off)

original:
CPU: P4 / Xeon with 2 hyper-threads, speed 2793.55 MHz (estimated)
Counted GLOBAL_POWER_EVENTS events
with a unit mask of 0x01 (mandatory) count 100000
samples  %        symbol name
66854     6.6725  AllocSetAlloc
47679     4.7587  base_yyparse
29058     2.9002  hash_search_with_hash_value
22053     2.2011  SearchCatCache
19264     1.9227  MemoryContextAllocZeroAligned
16223     1.6192  base_yylex
13819     1.3792  ScanKeywordLookup
13305     1.3279  expression_tree_walker
12144     1.2121  LWLockAcquire
11850     1.1827  XLogInsert
11817     1.1794  AllocSetFree

patched:
CPU: P4 / Xeon with 2 hyper-threads, speed 2793.55 MHz (estimated)
Counted GLOBAL_POWER_EVENTS events
with a unit mask of 0x01 (mandatory) count 100000
samples  %        symbol name
47610     4.9333  AllocSetAlloc
47441     4.9158  base_yyparse
28243     2.9265  hash_search_with_hash_value
22197     2.3000  SearchCatCache
18984     1.9671  MemoryContextAllocZeroAligned
15747     1.6317  base_yylex
13368     1.3852  ScanKeywordLookup
12889     1.3356  expression_tree_walker
12092     1.2530  LWLockAcquire
12078     1.2515  XLogInsert
(skip)
6248      0.6474  AllocSetFree

I think this patch improves AllocSetAlloc/AllocSetFree performance.

Best regards,

---
Atsushi Ogawa
a_ogawa@hi-ho.ne.jp




#!/usr/bin/perl

system "gcc -O2 -o alloc_test alloc_test.c";

my @test_bytes = (4,8,16,32,64,128,256,512,1024,
    '8 16 28 36 12 4 8 64 1024 8 24 12 8 64 16');
my $cnt = 10000000;

my @old_result;
my @new_result;
my ($t0, $e);

foreach $e (@test_bytes) {
    $t0 = (times)[2];
    system "./alloc_test old $cnt $e";
    push @old_result, (times)[2] - $t0;

    $t0 = (times)[2];
    system "./alloc_test new $cnt $e";
    push @new_result, (times)[2] - $t0;
}

print " bytes   : ";
foreach $e (@test_bytes) {
    $e = 'mix' if($e =~ /\d+ \d+/);
    printf("%5s ", $e);
}
print "\n";

print " original: ";
foreach $e (@old_result) { printf("%.3f ", $e); }
print "\n";

print " patched : ";
foreach $e (@new_result) { printf("%.3f ", $e); }
print "\n";

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define Assert(condition)

#define ALLOC_MINBITS 3
#define ALLOCSET_NUM_FREELISTS 11
typedef size_t Size;

/*
 * Faster version of AllocSetFreeIndex for the x86 architecture.
 * This function runs in O(1).
 */
static inline int
AllocSetFreeIndex_new(Size size)
{
    int idx;

    if (__builtin_expect(size < (1 << ALLOC_MINBITS), 0))
        size = (1 << ALLOC_MINBITS);

    /* bsr(Bit Scan Reverse): Search the most significant set bit */
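    /*
     * Why this matches the original shift loop: bsr(size - 1) yields
     * floor(log2(size - 1)), so for size > (1 << ALLOC_MINBITS) the value
     * bsr(size - 1) - (ALLOC_MINBITS - 1) equals the number of shifts the
     * loop performs on (size - 1) >> ALLOC_MINBITS.  In other words,
     * requests of 1..8 bytes map to freelist 0, 9..16 to freelist 1,
     * 17..32 to freelist 2, and so on.
     */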
    __asm__ ("bsr %1, %0" :"=r"(idx) :"g"(size - 1));

    return idx - (ALLOC_MINBITS - 1);
}

static inline int
AllocSetFreeIndex(Size size)
{
    int         idx = 0;

    if (size > 0)
    {
        size = (size - 1) >> ALLOC_MINBITS;
        while (size != 0)
        {
            idx++;
            size >>= 1;
        }
        Assert(idx < ALLOCSET_NUM_FREELISTS);
    }

    return idx;
}

int main(int argc, char *argv[])
{
    int        loop_cnt;
    int        size[16];
    int        i, j;
    int        result = 0;

    if(argc < 4) {
        fprintf(stderr, "usage: asettest (new|old) loop_cnt size...\n");
        return 1;
    }

    loop_cnt = atoi(argv[2]);

    for(i = 0; i < 16; i++) {
        if(argc <= i + 3) {
            size[i] = size[0];
        } else {
            size[i] = atoi(argv[i + 3]);
        }
    }

    if(strcmp(argv[1], "new") == 0) {
        for(i = 0; i < loop_cnt; i++) {
            for(j = 0; j < 16; j++) {
                result += AllocSetFreeIndex_new(size[j]);
            }
        }
    } else if(strcmp(argv[1], "old") == 0) {
        for(i = 0; i < loop_cnt; i++) {
            for(j = 0; j < 16; j++) {
                result += AllocSetFreeIndex(size[j]);
            }
        }
    } else {
        fprintf(stderr, "usage: asettest (new|old) size loop_cnt\n");
        return 1;
    }

    fprintf(stderr, "%s, size:%d, loop:%d, checksum:%d\n",
        argv[1], size[0], loop_cnt, result);

    return 0;
}

*** ./src/backend/utils/mmgr/aset.c.orig    2009-06-01 23:12:10.000000000 +0900
--- ./src/backend/utils/mmgr/aset.c    2009-06-02 08:47:30.000000000 +0900
***************
*** 263,268 ****
--- 263,287 ----
   *        that size <= ALLOC_CHUNK_LIMIT.
   * ----------
   */
+ #if defined(__i386__) && defined(__GNUC__)
+ /*
+  * faster version of AllocSetFreeIndex for x86 architecture.
+  * this function runs in O(1).
+  */
+ static inline int
+ AllocSetFreeIndex(Size size)
+ {
+     int idx;
+
+     if (__builtin_expect(size < (1 << ALLOC_MINBITS), 0))
+         size = (1 << ALLOC_MINBITS);
+
+     /* bsr(Bit Scan Reverse): Search the most significant set bit */
+     __asm__ ("bsr %1, %0" :"=r"(idx) :"g"(size - 1));
+
+     return idx - (ALLOC_MINBITS - 1);
+ }
+ #else
  static inline int
  AllocSetFreeIndex(Size size)
  {
***************
*** 281,286 ****
--- 300,306 ----

      return idx;
  }
+ #endif /* defined(__i386__) && defined(__GNUC__) */

  #ifdef RANDOMIZE_ALLOCATED_MEMORY


Re: faster version of AllocSetFreeIndex for x86 architecture

From: Jeremy Kerr
Date:
Hi,

> I made a faster version of AllocSetFreeIndex for x86 architecture.

Neat, I have a version for PowerPC too.

In order to prevent writing multiple copies of AllocSetFreeIndex, I
propose that we add an fls() function ("find last set"); this can be
defined in an architecture-independent manner (i.e., shift, mask and
test in a loop), and re-defined for arches that have faster ways of
doing the same (e.g., the cntlz instruction on PowerPC).

We can then change AllocSetFreeIndex to use fls().

Patches coming...



Jeremy


[PATCH 1/2] Add bit operations util header

From: Jeremy Kerr
Date:
Add a utility header for simple bit operations - bitops.h.

At present, just contains the fls() (find last set bit) function.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>

---
 src/include/utils/bitops.h |   52 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/include/utils/bitops.h b/src/include/utils/bitops.h
new file mode 100644
index 0000000..de11624
--- /dev/null
+++ b/src/include/utils/bitops.h
@@ -0,0 +1,52 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitops.h
+ *      Simple bit operations.
+ *
+ * Portions Copyright (c) 2009, PostgreSQL Global Development Group
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BITOPS_H
+#define BITOPS_H
+
+#if defined(__ppc__) || defined(__powerpc__) || \
+    defined(__ppc64__) || defined (__powerpc64__)
+
+static inline int
+fls(unsigned int x)
+{
+    int lz;
+    asm("cntlz %0,%1" : "=r" (lz) : "r" (x));
+    return 32 - lz;
+}
+
+#else /* !powerpc */
+
+/* Architecture-independent implementations */
+
+/*
+ * fls: find last set bit.
+ *
+ * Returns the 1-based index of the most-significant bit in x. The MSB
+ * is bit number 32, the LSB is bit number 1. If x is zero, returns zero.
+ */
+static inline int
+fls(unsigned int x)
+{
+    int ls = 0;
+
+    while (x != 0)
+    {
+        ls++;
+        x >>= 1;
+    }
+
+    return ls;
+}
+
+#endif
+
+#endif /* BITOPS_H */


[PATCH 2/2] Use fls() to find chunk set

From: Jeremy Kerr
Date:
Results in a ~2% performance increase by using the powerpc fls()
implementation.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>

---
 src/backend/utils/mmgr/aset.c |    8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index 0e2d4d5..762cf72 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -65,6 +65,7 @@
 #include "postgres.h"
 
 #include "utils/memutils.h"
+#include "utils/bitops.h"
 
 /* Define this to detail debug alloc information */
 /* #define HAVE_ALLOCINFO */
@@ -270,12 +271,7 @@ AllocSetFreeIndex(Size size)
     if (size > 0)
     {
-        size = (size - 1) >> ALLOC_MINBITS;
-        while (size != 0)
-        {
-            idx++;
-            size >>= 1;
-        }
+        idx = fls((size - 1) >> ALLOC_MINBITS);
         Assert(idx < ALLOCSET_NUM_FREELISTS);
     }


Re: [PATCH 1/2] Add bit operations util header

From: Tom Lane
Date:
Jeremy Kerr <jk@ozlabs.org> writes:
> Add a utility header for simple bit operations - bitops.h.

This will fail outright on any non-gcc compiler.
        regards, tom lane


[PATCH v2] Add bit operations util header

From: Jeremy Kerr
Date:
Add a utility header for simple bit operations - bitops.h.

At present, just contains the fls() (find last set bit) function.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>

---
v2: only use inline asm with gcc

---
 src/include/utils/bitops.h |   53 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/src/include/utils/bitops.h b/src/include/utils/bitops.h
new file mode 100644
index 0000000..4f2bbc9
--- /dev/null
+++ b/src/include/utils/bitops.h
@@ -0,0 +1,53 @@
+/*-------------------------------------------------------------------------
+ *
+ * bitops.h
+ *      Simple bit operations.
+ *
+ * Portions Copyright (c) 2009, PostgreSQL Global Development Group
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BITOPS_H
+#define BITOPS_H
+
+#if defined(__GNUC__) && \
+    (defined(__ppc__) || defined(__powerpc__) || \
+     defined(__ppc64__) || defined (__powerpc64__))
+
+static inline int
+fls(unsigned int x)
+{
+    int lz;
+    asm("cntlz %0,%1" : "=r" (lz) : "r" (x));
+    return 32 - lz;
+}
+
+#else /* !(gcc && powerpc) */
+
+/* Architecture-independent implementations */
+
+/*
+ * fls: find last set bit.
+ *
+ * Returns the 1-based index of the most-significant bit in x. The MSB
+ * is bit number 32, the LSB is bit number 1. If x is zero, returns zero.
+ */
+static inline int
+fls(unsigned int x)
+{
+    int ls = 0;
+
+    while (x != 0)
+    {
+        ls++;
+        x >>= 1;
+    }
+
+    return ls;
+}
+
+#endif
+
+#endif /* BITOPS_H */


Re: [PATCH v2] Add bit operations util header

From: Florian Weimer
Date:
* Jeremy Kerr:

> +#if defined(__GNUC__) && \
> +    (defined(__ppc__) || defined(__powerpc__) || \
> +     defined(__ppc64__) || defined (__powerpc64__))

If you require GCC anyway, you can use __builtin_clz instead.
(It's been available since GCC 4.1 at least.)
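For illustration, an fls() built on the builtin is essentially one line
(a sketch with a hypothetical name, not code from this thread; note that
__builtin_clz(0) is undefined):

    /* 32 - clz(x) is the 1-based bit number of x's most significant set
     * bit; because __builtin_clz(0) is undefined, callers must ensure
     * that x != 0. */
    static inline int
    fls_clz(unsigned int x)
    {
        return 32 - __builtin_clz(x);
    }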

--
Florian Weimer                <fweimer@bfk.de>
BFK edv-consulting GmbH       http://www.bfk.de/
Kriegsstraße 100              tel: +49-721-96201-1
D-76133 Karlsruhe             fax: +49-721-96201-99


Re: [PATCH v2] Add bit operations util header

From: Jeremy Kerr
Date:
Florian,

> > +#if defined(__GNUC__) && \
> > +    (defined(__ppc__) || defined(__powerpc__) || \
> > +     defined(__ppc64__) || defined (__powerpc64__))
>
> If you require GCC anyway, you can use __builtin_clz instead.
> (It's been available since GCC 4.1 at least.)

Because now we have to test the compiler *and* the version as well?

But I do agree that using the builtins makes for much better code; I'm 
looking at a future change that does this.

Cheers,


Jeremy



Re: [PATCH v2] Add bit operations util header

From: Florian Weimer
Date:
* Jeremy Kerr:

> Florian,
>
>> > +#if defined(__GNUC__) && \
>> > +    (defined(__ppc__) || defined(__powerpc__) || \
>> > +     defined(__ppc64__) || defined (__powerpc64__))
>>
>> If you require GCC anyway, you can use __builtin_clz instead.
>> (It's been available since GCC 4.1 at least.)
>
> Because now we have to test the compiler *and* the version as well?

This builtin is not architecture-specific, so you'd save the
architecture check.

--
Florian Weimer                <fweimer@bfk.de>
BFK edv-consulting GmbH       http://www.bfk.de/
Kriegsstraße 100              tel: +49-721-96201-1
D-76133 Karlsruhe             fax: +49-721-96201-99


Re: [PATCH v2] Add bit operations util header

From: Tom Lane
Date:
Florian Weimer <fweimer@bfk.de> writes:
> * Jeremy Kerr:
>> Because now we have to test the compiler *and* the version as well?

> This builtin is not architecture-specific, so you'd save the
> architecture check.

The appropriate way to handle it would be a configure probe to see if
the function is available, thus avoiding any wired-in knowledge about
compiler or compiler version *or* architecture.

The other thing I didn't like about the patch was the assumption that
it's okay to have a "static inline" function in a header.  You can
get away with that in gcc but *not* in other compilers.  Look at the
existing coding patterns for, eg, list_head; then go thou and do
likewise.  Or, since there's currently no need for the code outside
aset.c, forget about putting it in a header and just plop it into
aset.c.
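For reference, the list_head pattern in question looks roughly like this
(a sketch of the USE_INLINE convention used by pg_list.h and list.c at
the time, not code from this thread):

    /* in the header: inline only where the compiler supports it */
    #ifdef USE_INLINE
    static inline ListCell *
    list_head(const List *l)
    {
        return l ? l->head : NULL;
    }
    #else
    extern ListCell *list_head(const List *l);
    #endif   /* USE_INLINE */

    /* in list.c: out-of-line definition when USE_INLINE is not defined */
    #ifndef USE_INLINE
    ListCell *
    list_head(const List *l)
    {
        return l ? l->head : NULL;
    }
    #endif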
        regards, tom lane


Re: [PATCH v2] Add bit operations util header

From: Jeremy Kerr
Date:
Hi Tom,

> The other thing I didn't like about the patch was the assumption that
> it's okay to have a "static inline" function in a header.  You can
> get away with that in gcc but *not* in other compilers.

Gee, you user-space guys have it tough! :D

Point taken, will rework.

> Look at the existing coding patterns for, eg, list_head; then go thou
> and do likewise.  Or, since there's currently no need for the code
> outside aset.c, forget about putting it in a header and just plop it
> into aset.c.

OK, I'll add a configure check and conditionally use the builtin if it's 
available. I have some other patches that could be improved by using 
other builtins, so it would be a good opportunity to figure out a nice 
pattern for doing this.

Cheers,


Jeremy


[RFC,PATCH] Avoid manual shift-and-test logic in AllocSetFreeIndex

From: Jeremy Kerr
Date:
Move the shift-and-test logic into a separate fls() function, which
can use __builtin_clz() if it's available.

This requires a new check for __builtin_clz in the configure script.

Results in a ~2% performance increase on PowerPC.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>

---
 configure.in                  |   13 +++++++++++++
 src/backend/utils/mmgr/aset.c |   32 ++++++++++++++++++++++++++------
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/configure.in b/configure.in
index b8d2685..6a317b0 100644
--- a/configure.in
+++ b/configure.in
@@ -1361,6 +1361,19 @@ case $host_os in
         AC_FUNC_FSEEKO;;
 esac
 
+# GCC builtins
+#
+# We need AC_TRY_LINK here, as the prototype generated by AC_CHECK_FUNC
+# will cause gcc to try to reference a non-builtin symbol.
+
+AC_MSG_CHECKING([for __builtin_clz])
+AC_TRY_LINK([],
+        [__builtin_clz(0);],
+        [AC_DEFINE(HAVE_BUILTIN_CLZ, 1,
+                [Define to 1 if you have __builtin_clz().])
+            AC_MSG_RESULT(yes)],
+        [AC_MSG_RESULT(no)])
+
 #
 # Pthreads
diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index 0e2d4d5..af352b8 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -255,6 +255,31 @@ static MemoryContextMethods AllocSetMethods = {
 #define AllocAllocInfo(_cxt, _chunk)
 #endif
 
+/*
+ * fls: find last set bit.
+ *
+ * Returns the 1-based index of the most-significant bit in x. The MSB
+ * is bit number 32, the LSB is bit number 1. If x is zero, the result is
+ * undefined.
+ */
+static inline int
+fls(unsigned int x)
+{
+#ifdef HAVE_BUILTIN_CLZ
+    return 32 - __builtin_clz(x);
+#else
+    int ls = 0;
+
+    while (x != 0)
+    {
+        ls++;
+        x >>= 1;
+    }
+
+    return ls;
+#endif
+}
+
 /* ----------
  * AllocSetFreeIndex -
  *
@@ -270,12 +295,7 @@ AllocSetFreeIndex(Size size)
     if (size > 0)
     {
-        size = (size - 1) >> ALLOC_MINBITS;
-        while (size != 0)
-        {
-            idx++;
-            size >>= 1;
-        }
+        idx = fls((size - 1) >> ALLOC_MINBITS);
         Assert(idx < ALLOCSET_NUM_FREELISTS);
     }


Re: [RFC,PATCH] Avoid manual shift-and-test logic in AllocSetFreeIndex

From: Atsushi Ogawa
Date:
> +/*
> + * fls: find last set bit.
> + *
> + * Returns the 1-based index of the most-significant bit in x. The MSB
> + * is bit number 32, the LSB is bit number 1. If x is zero, the result is
> + * undefined.
> + */
> +static inline int
> +fls(unsigned int x)
...
> +        idx = fls((size - 1) >> ALLOC_MINBITS);

If size <= 8, fls((size - 1) >> ALLOC_MINBITS) is fls(0).
The result of fls(0) is undefined.

I think we must never call fls(0) from AllocSetFreeIndex().
My proposal code:
    if (size > (1 << ALLOC_MINBITS))
    {
        idx = fls((size - 1) >> ALLOC_MINBITS);
        Assert(idx < ALLOCSET_NUM_FREELISTS);
    }
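A quick standalone check (hypothetical, not one of the attached files)
that the guarded form agrees with the original shift loop for every
request size up to the 8192-byte chunk limit:

    #include <assert.h>

    #define ALLOC_MINBITS 3

    /* original shift-and-test algorithm */
    static int index_loop(unsigned int size)
    {
        int idx = 0;

        if (size > 0)
        {
            size = (size - 1) >> ALLOC_MINBITS;
            while (size != 0)
            {
                idx++;
                size >>= 1;
            }
        }
        return idx;
    }

    /* portable fls() fallback, as in the proposed bitops.h */
    static int fls(unsigned int x)
    {
        int ls = 0;

        while (x != 0)
        {
            ls++;
            x >>= 1;
        }
        return ls;
    }

    /* guarded version: never calls fls(0) */
    static int index_fls(unsigned int size)
    {
        int idx = 0;

        if (size > (1 << ALLOC_MINBITS))
            idx = fls((size - 1) >> ALLOC_MINBITS);
        return idx;
    }

    int main(void)
    {
        unsigned int size;

        for (size = 1; size <= 8192; size++)
            assert(index_loop(size) == index_fls(size));
        return 0;
    }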
 

Best regards,

---
Atsushi Ogawa


Re: faster version of AllocSetFreeIndex for x86 architecture

From: Simon Riggs
Date:
On Tue, 2009-06-02 at 23:53 +0900, Atsushi Ogawa wrote:

> I made a faster version of AllocSetFreeIndex for x86 architecture.

> Results of benchmark script:
> Xeon(Core architecture), RedHat EL4, gcc 3.4.6
>  bytes   :     4     8    16    32    64   128   256   512  1024   mix
>  original: 0.780 0.780 0.820 0.870 0.930 0.970 1.030 1.080 1.130 0.950
>  patched : 0.380 0.170 0.170 0.170 0.170 0.180 0.170 0.180 0.180 0.280

> The effect of the patch, as measured by oprofile, is:
> - test program: pgbench -c 1 -t 50000 (fsync=off)
> 
> original:
> CPU: P4 / Xeon with 2 hyper-threads, speed 2793.55 MHz (estimated)
> Counted GLOBAL_POWER_EVENTS events
> with a unit mask of 0x01 (mandatory) count 100000
> samples  %        symbol name
> 66854     6.6725  AllocSetAlloc

> patched:
> CPU: P4 / Xeon with 2 hyper-threads, speed 2793.55 MHz (estimated)
> Counted GLOBAL_POWER_EVENTS events
> with a unit mask of 0x01 (mandatory) count 100000
> samples  %        symbol name
> 47610     4.9333  AllocSetAlloc

> I think this patch improves AllocSetAlloc/AllocSetFree performance.

Looks like very good work. Much appreciated.

-- 
 Simon Riggs           www.2ndQuadrant.com
 PostgreSQL Training, Services and Support



Re: [RFC,PATCH] Avoid manual shift-and-test logic in AllocSetFreeIndex

From: Jeremy Kerr
Date:
Hi Atsushi,

> If size <= 8, fls((size - 1) >> ALLOC_MINBITS) is fls(0).
> The result of fls(0) is undefined.

Yep, got caught out by this because my previous fls() supported zero.

> I think we must never call fls(0) from AllocSetFreeIndex().
> My proposal code:
>
>      if (size > (1 << ALLOC_MINBITS))
>      {
>          idx = fls((size - 1) >> ALLOC_MINBITS);
>          Assert(idx < ALLOCSET_NUM_FREELISTS);
>      }

Looks good, I'll send an updated patch.

Also, are you still seeing the same improvement with __builtin_clz as
with your inline asm implementation?

Cheers,


Jeremy


[PATCH v2] Avoid manual shift-and-test logic in AllocSetFreeIndex

From: Jeremy Kerr
Date:
Move the shift-and-test logic into a separate fls() function, which
can use __builtin_clz() if it's available.

This requires a new check for __builtin_clz in the configure script.

Results in a ~2% performance increase on PowerPC.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>

---
v2: prevent fls(0)

---
 configure.in                  |   13 +++++++++++++
 src/backend/utils/mmgr/aset.c |   34 +++++++++++++++++++++++++++-------
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/configure.in b/configure.in
index b8d2685..6a317b0 100644
--- a/configure.in
+++ b/configure.in
@@ -1361,6 +1361,19 @@ case $host_os in
         AC_FUNC_FSEEKO;;
 esac
 
+# GCC builtins
+#
+# We need AC_TRY_LINK here, as the prototype generated by AC_CHECK_FUNC
+# will cause gcc to try to reference a non-builtin symbol.
+
+AC_MSG_CHECKING([for __builtin_clz])
+AC_TRY_LINK([],
+        [__builtin_clz(0);],
+        [AC_DEFINE(HAVE_BUILTIN_CLZ, 1,
+                [Define to 1 if you have __builtin_clz().])
+            AC_MSG_RESULT(yes)],
+        [AC_MSG_RESULT(no)])
+
 #
 # Pthreads
diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index 0e2d4d5..9eb3117 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -255,6 +255,31 @@ static MemoryContextMethods AllocSetMethods = {
 #define AllocAllocInfo(_cxt, _chunk)
 #endif
 
+/*
+ * fls: find last set bit.
+ *
+ * Returns the 1-based index of the most-significant bit in x. The MSB
+ * is bit number 32, the LSB is bit number 1. If x is zero, the result is
+ * undefined.
+ */
+static inline int
+fls(unsigned int x)
+{
+#ifdef HAVE_BUILTIN_CLZ
+    return 32 - __builtin_clz(x);
+#else
+    int ls = 0;
+
+    while (x != 0)
+    {
+        ls++;
+        x >>= 1;
+    }
+
+    return ls;
+#endif
+}
+
 /* ----------
  * AllocSetFreeIndex -
  *
@@ -268,14 +293,9 @@ AllocSetFreeIndex(Size size)
 {
     int            idx = 0;
 
-    if (size > 0)
+    if (size > (1 << ALLOC_MINBITS))
     {
-        size = (size - 1) >> ALLOC_MINBITS;
-        while (size != 0)
-        {
-            idx++;
-            size >>= 1;
-        }
+        idx = fls((size - 1) >> ALLOC_MINBITS);
         Assert(idx < ALLOCSET_NUM_FREELISTS);
     }


Re: [RFC,PATCH] Avoid manual shift-and-test logic in AllocSetFreeIndex

From: Atsushi Ogawa
Date:
Hi,

> Also, are you still seeing the same improvement with __builtin_clz as
> with your inline asm implementation?

In my benchmark program, the fls implementation and the inline asm
implementation perform a little differently. However, the pgbench
result shows almost the same improvement.

Here is the result of my benchmark.

Xeon(Core architecture)
 bytes     :     4     8    16    32    64   128   256   512  1024   mix
 original  : 0.780 0.790 0.820 0.870 0.930 0.980 1.040 1.080 1.140 0.910
 inline asm: 0.320 0.180 0.190 0.180 0.190 0.180 0.190 0.180 0.190 0.170
 fls       : 0.270 0.260 0.290 0.290 0.290 0.290 0.290 0.300 0.290 0.380

Xeon(P4 architecture)
 bytes     :     4     8    16    32    64   128   256   512  1024   mix
 original  : 0.520 0.520 0.670 0.780 0.950 1.000 1.060 1.190 1.250 0.940
 inline asm: 0.610 0.530 0.530 0.520 0.520 0.540 0.540 0.580 0.540 0.600
 fls       : 0.390 0.370 0.780 0.780 0.780 0.790 0.780 0.780 0.780 0.520


pgbench result (measured by oprofile)

CPU: Xeon(P4 architecture)
test program: pgbench -c 1 -t 50000 (fsync=off)

original
samples  %        symbol name
66854     6.6725  AllocSetAlloc
11817     1.1794  AllocSetFree

inline asm
samples  %        symbol name
47610     4.9333  AllocSetAlloc
6248      0.6474  AllocSetFree

fls
samples  %        symbol name
48779     4.9954  AllocSetAlloc
7648      0.7832  AllocSetFree

Best regards,

---
Atsushi Ogawa


Re: [RFC,PATCH] Avoid manual shift-and-test logic in AllocSetFreeIndex

From: Jeremy Kerr
Date:
Hi Atsushi,

> In my benchmark program, the fls implementation and the inline asm
> implementation perform a little differently. However, the pgbench
> result shows almost the same improvement.
>
> Here is the result of my benchmark.

Excellent, thank you for getting this extra set of numbers.

Cheers,


Jeremy