malloc: Use current (C11-style) atomics for fastbin access

Message ID 87va52nupb.fsf@oldenburg.str.redhat.com
State New
Headers show
Series
  • malloc: Use current (C11-style) atomics for fastbin access
Related show

Commit Message

Florian Weimer Nov. 12, 2018, 4:09 p.m.
This is another cleanup patch in preparation of the extended heap
protector (which will cover the fd/bk/fd_nextsize/bk_nextsize fields in
struct malloc_chunk, too).

By the way, there is an optimization opportunity for the tcache backfill
in _int_malloc: After the acquire MO load on the fastbin list head, we
can traverse the fastbin list as far as we need in order to refill the
tcache, and update the new list head with a single CAS.  This does not
have races (ABA races and the like) because we have acquired the arena
lock at this point.  Some backoff is probably needed in case the fastbin
list head is contended.  But it is probably a good idea to do the first
traversal at least once.

Thanks,
Florian

2018-11-12  Florian Weimer  <fweimer@redhat.com>

	* malloc/malloc.c (fastbin_push_entry): New function.
	(fastbin_pop_entry): Likewise.  Replaces REMOVE_FB.
	(REMOVE_FB): Remove macro.
	(_int_malloc): Use fastbin_pop_entry and reindent.
	(_int_free): Use fastbin_push_entry.
	(malloc_consolidate): Use atomic_exchange_acquire.

Comments

Anton Blanchard Jan. 15, 2019, 10:26 p.m. | #1
Hi Florian,

> This is another cleanup patch in preparation of the extended heap

> protector (which will cover the fd/bk/fd_nextsize/bk_nextsize fields

> in struct malloc_chunk, too).

> 

> By the way, there is an optimization opportunity for the tcache

> backfill in _int_malloc: After the acquire MO load on the fastbin

> list head, we can traverse the fastbin list as far as we need in

> order to refill the tcache, and update the new list head with a

> single CAS.  This does not have races (ABA races and the like)

> because we have acquired the arena lock at this point.  Some backoff

> is probably needed in case the fastbin list head is contended.  But

> it is probably a good idea to do the first traversal at least once.


I see a 16% regression on ppc64le with a simple threaded malloc test
case. I guess the C11 atomics aren't as good as what we have in glibc.

Thanks,
Anton
--

#include <malloc.h>
#include <pthread.h>
#include <unistd.h>

#define SIZE 8
#define BATCH 128
#define ITERATIONS (50000000/BATCH)

void *ptrs[BATCH];

static void __attribute__((noinline)) do_one(void)
{
	for (unsigned long i = 0; i < BATCH; i++)
		ptrs[i] = malloc(SIZE);

	asm volatile("# %0": : "m"(ptrs[0]));

	for (unsigned long i = 0; i < BATCH; i++)
		free(ptrs[i]);
}

static void *doit(void *junk)
{
	while (1)
		sleep(3600);

	return NULL;
}

int main(void)
{
	unsigned int i;
	pthread_t tid;

	pthread_create(&tid, NULL, doit, NULL);

	for (i = 0; i < ITERATIONS; i++)
		do_one();

	return 0;
}

> Thanks,

> Florian

> 

> 2018-11-12  Florian Weimer  <fweimer@redhat.com>

> 

> 	* malloc/malloc.c (fastbin_push_entry): New function.

> 	(fastbin_pop_entry): Likewise.  Replaces REMOVE_FB.

> 	(REMOVE_FB): Remove macro.

> 	(_int_malloc): Use fastbin_pop_entry and reindent.

> 	(_int_free): Use fastbin_push_entry.

> 	(malloc_consolidate): Use atomic_exchange_acquire.

> 

> diff --git a/malloc/malloc.c b/malloc/malloc.c

> index bfc605aa3e..7c2186c307 100644

> --- a/malloc/malloc.c

> +++ b/malloc/malloc.c

> @@ -1316,6 +1316,77 @@ nextchunk->

> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

> #define set_foot(p, s)       (((mchunkptr) ((char *) (p) +

> (s)))->mchunk_prev_size = (s)) 

> +/* Add an item to the atomic fastbin list at *ROOT.  Returns the old

> +   value at *ROOT.  Note that properties of the old chunk are only

> +   stable if the caller has acquired the arena lock.  With out the

> +   lock, it can be deallocated at any time.  */

> +static inline struct malloc_chunk *

> +fastbin_push_entry (struct malloc_chunk **root, struct malloc_chunk

> *e) +{

> +  struct malloc_chunk *head;

> +  if (SINGLE_THREAD_P)

> +    {

> +      /* Check that the top of the bin is not the record we are going

> +	 to add (i.e., double free).  */

> +      head = *root;

> +      if (head == e)

> +	malloc_printerr ("double free or corruption (fasttop)");

> +      e->fd = head;

> +      *root = e;

> +    }

> +  else

> +    do

> +      {

> +	/* Synchronize with the release release MO CAS below.  We do

> +	   not need synchronization locally, but fastbin_pop_entry

> and

> +	   (especially) malloc_consolidate read the entire list after

> +	   synchronizing on the head, so we need to make sure that

> the

> +	   writes to the next (fd) pointers have happened.  */

> +	head = atomic_load_acquire (root);

> +	/* Check that the top of the bin is not the record we are

> +	   going to add (i.e., double free).  */

> +	if (head == e)

> +	  malloc_printerr ("double free or corruption (fasttop)");

> +	e->fd = head;

> +      }

> +  /* Synchronizes with the acquire MO CAS in  */

> +    while (!atomic_compare_exchange_weak_release (root, &head, e));

> +  return head;

> +}

> +

> +/* Remove an item from the atomic fastbin list at *ROOT.  The caller

> +   must have acquired the arena lock.  */

> +static inline struct malloc_chunk *

> +fastbin_pop_entry (struct malloc_chunk **root)

> +{

> +  struct malloc_chunk *head;

> +  if (SINGLE_THREAD_P)

> +    {

> +      head = *root;

> +      if (head != NULL)

> +	*root = head->fd;

> +    }

> +  else

> +    {

> +      /* Synchromizes with the release MO store in

> fastbin_push_entry.

> +	 Synchronization is needed because we read the next list

> +	 pointer.  */

> +      head = atomic_load_acquire (root);

> +      struct malloc_chunk *tail;

> +      do

> +	if (head == NULL)

> +	  return NULL;

> +	else

> +	  tail = head->fd;

> +      /* Synchronizes with the release MO store in

> fastbin_push_entry.

> +	 We do not have an ABA issue here because the caller has

> +	 acquired the arena lock, which ensures that there is only

> one

> +	 thread which removes elements from this list.  */

> +      while (!atomic_compare_exchange_weak_acquire (root, &head,

> tail));

> +    }

> +  return head;

> +}

> +

>  #pragma GCC poison mchunk_size

>  #pragma GCC poison mchunk_prev_size

>  

> @@ -3559,63 +3630,36 @@ _int_malloc (mstate av, size_t bytes)

>       can try it without checking, which saves some time on this fast

> path. */

>  

> -#define REMOVE_FB(fb, victim, pp)			\

> -  do							\

> -    {							\

> -      victim = pp;					\

> -      if (victim == NULL)				\

> -	break;						\

> -    }							\

> -  while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd,

> victim)) \

> -	 != victim);					\

> -

>    if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))

>      {

>        idx = fastbin_index (nb);

>        mfastbinptr *fb = &fastbin (av, idx);

> -      mchunkptr pp;

> -      victim = *fb;

> -

> +      victim = fastbin_pop_entry (fb);

>        if (victim != NULL)

>  	{

> -	  if (SINGLE_THREAD_P)

> -	    *fb = victim->fd;

> -	  else

> -	    REMOVE_FB (fb, pp, victim);

> -	  if (__glibc_likely (victim != NULL))

> -	    {

> -	      size_t victim_idx = fastbin_index (chunksize (victim));

> -	      if (__builtin_expect (victim_idx != idx, 0))

> -		malloc_printerr ("malloc(): memory corruption

> (fast)");

> -	      check_remalloced_chunk (av, victim, nb);

> +	  size_t victim_idx = fastbin_index (chunksize (victim));

> +	  if (victim_idx != idx)

> +	    malloc_printerr ("malloc(): memory corruption (fast)");

> +	  check_remalloced_chunk (av, victim, nb);

>  #if USE_TCACHE

> -	      /* While we're here, if we see other chunks of the

> same size,

> -		 stash them in the tcache.  */

> -	      size_t tc_idx = csize2tidx (nb);

> -	      if (tcache && tc_idx < mp_.tcache_bins)

> +	  /* While we're here, if we see other chunks of the same

> size,

> +	     stash them in the tcache.  */

> +	  size_t tc_idx = csize2tidx (nb);

> +	  if (tcache && tc_idx < mp_.tcache_bins)

> +	    {

> +	      /* While bin not empty and tcache not full, copy

> chunks.  */

> +	      while (tcache->counts[tc_idx] < mp_.tcache_count)

>  		{

> -		  mchunkptr tc_victim;

> -

> -		  /* While bin not empty and tcache not full, copy

> chunks.  */

> -		  while (tcache->counts[tc_idx] < mp_.tcache_count

> -			 && (tc_victim = *fb) != NULL)

> -		    {

> -		      if (SINGLE_THREAD_P)

> -			*fb = tc_victim->fd;

> -		      else

> -			{

> -			  REMOVE_FB (fb, pp, tc_victim);

> -			  if (__glibc_unlikely (tc_victim == NULL))

> -			    break;

> -			}

> -		      tcache_put (tc_victim, tc_idx);

> -		    }

> +		  mchunkptr tc_victim = fastbin_pop_entry (fb);

> +		  if (tc_victim == NULL)

> +		    break;

> +		  tcache_put (tc_victim, tc_idx);

>  		}

> -#endif

> -	      void *p = chunk2mem (victim);

> -	      alloc_perturb (p, bytes);

> -	      return p;

>  	    }

> +#endif

> +	  void *p = chunk2mem (victim);

> +	  alloc_perturb (p, bytes);

> +	  return p;

>  	}

>      }

>  

> @@ -4227,28 +4271,7 @@ _int_free (mstate av, mchunkptr p, int

> have_lock) fb = &fastbin (av, idx);

>  

>      /* Atomically link P to its fastbin: P->FD = *FB; *FB = P;  */

> -    mchunkptr old = *fb, old2;

> -

> -    if (SINGLE_THREAD_P)

> -      {

> -	/* Check that the top of the bin is not the record we are

> going to

> -	   add (i.e., double free).  */

> -	if (__builtin_expect (old == p, 0))

> -	  malloc_printerr ("double free or corruption (fasttop)");

> -	p->fd = old;

> -	*fb = p;

> -      }

> -    else

> -      do

> -	{

> -	  /* Check that the top of the bin is not the record we are

> going to

> -	     add (i.e., double free).  */

> -	  if (__builtin_expect (old == p, 0))

> -	    malloc_printerr ("double free or corruption (fasttop)");

> -	  p->fd = old2 = old;

> -	}

> -      while ((old = catomic_compare_and_exchange_val_rel (fb, p,

> old2))

> -	     != old2);

> +    mchunkptr old = fastbin_push_entry (fb, p);

>  

>      /* Check that size of fastbin chunk at the top is the same as

>         size of the chunk that we are adding.  We can dereference OLD

> @@ -4439,7 +4462,9 @@ static void malloc_consolidate(mstate av)

>    maxfb = &fastbin (av, NFASTBINS - 1);

>    fb = &fastbin (av, 0);

>    do {

> -    p = atomic_exchange_acq (fb, NULL);

> +    /* Synchronizes with the release MO store in

> +       fastbin_push_entry.  */

> +    p = atomic_exchange_acquire (fb, NULL);

>      if (p != 0) {

>        do {

>  	{
Florian Weimer Jan. 15, 2019, 10:35 p.m. | #2
* Anton Blanchard:

> Hi Florian,

>

>> This is another cleanup patch in preparation of the extended heap

>> protector (which will cover the fd/bk/fd_nextsize/bk_nextsize fields

>> in struct malloc_chunk, too).

>> 

>> By the way, there is an optimization opportunity for the tcache

>> backfill in _int_malloc: After the acquire MO load on the fastbin

>> list head, we can traverse the fastbin list as far as we need in

>> order to refill the tcache, and update the new list head with a

>> single CAS.  This does not have races (ABA races and the like)

>> because we have acquired the arena lock at this point.  Some backoff

>> is probably needed in case the fastbin list head is contended.  But

>> it is probably a good idea to do the first traversal at least once.

>

> I see a 16% regression on ppc64le with a simple threaded malloc test

> case. I guess the C11 atomics aren't as good as what we have in glibc.


Uh-oh.  Would you please check if replacing the two atomic_load_acquire
with atomic_load_relaxed restore the previous performance?

I believe both loads need to be acquire MO under the C11 memory model
(see the comments why), but the old code did not have them.

Thanks,
Florian
Anton Blanchard Jan. 16, 2019, 6:18 a.m. | #3
Hi Florian,

> > I see a 16% regression on ppc64le with a simple threaded malloc test

> > case. I guess the C11 atomics aren't as good as what we have in

> > glibc.  

> 

> Uh-oh.  Would you please check if replacing the two

> atomic_load_acquire with atomic_load_relaxed restore the previous

> performance?


As you suspect, doing this does restore the performance. The two lwsync
barrier instructions must be causing the slow down.

> I believe both loads need to be acquire MO under the C11 memory model

> (see the comments why), but the old code did not have them.


Ok, thanks for looking into it.

Thanks,
Anton
Florian Weimer Jan. 16, 2019, 12:30 p.m. | #4
* Anton Blanchard:

> Hi Florian,

>

>> > I see a 16% regression on ppc64le with a simple threaded malloc test

>> > case. I guess the C11 atomics aren't as good as what we have in

>> > glibc.  

>> 

>> Uh-oh.  Would you please check if replacing the two

>> atomic_load_acquire with atomic_load_relaxed restore the previous

>> performance?

>

> As you suspect, doing this does restore the performance. The two lwsync

> barrier instructions must be causing the slow down.


Okay, I'll post a patch to revert that commit.

However, the old code had this in _int_malloc (where the arena lock is
acquired):

#define REMOVE_FB(fb, victim, pp)                       \
  do                                                    \
    {                                                   \
      victim = pp;                                      \
      if (victim == NULL)                               \
        break;                                          \
    }                                                   \
  while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
         != victim);                                    \
…
            REMOVE_FB (fb, pp, victim);

And this in _int_free (without the arena lock):

      do
        {
          /* Check that the top of the bin is not the record we are going to
             add (i.e., double free).  */
          if (__builtin_expect (old == p, 0))
            malloc_printerr ("double free or corruption (fasttop)");
          p->fd = old2 = old;
        }
      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
             != old2);

I really don't see what makes sure that the store of p->fd happens
before the load of victim->fd.

It works out on POWER for some reason.  I'm attaching a test case that
should exercise these two code paths.

Thanks,
Florian
/* Run with: GLIBC_TUNABLES=glibc.malloc.tcache_count=0 */

#include <err.h>
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
usage (char **argv)
{
  fprintf (stderr, "usage: %s THREADS FREES-PER-THREAD\n", argv[0]);
  exit (1);
}

static void
check (const char *function, int ret)
{
  if (ret != 0)
    {
      errno = ret;
      err (1, "%s", function);
    }
}

static pthread_barrier_t barrier;

static void
xpthread_barrier_wait (pthread_barrier_t *barrier)
{
  int ret = pthread_barrier_wait (barrier);
  if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD)
    {
      errno = ret;
      err (1, "pthread_barrier_wait");
    }
}

static int per_thread;

static void *
thread_routine (void *closure)
{
  char **allocations = closure;
  int count = per_thread;
  while (true)
    {
      /* Wait for the main thread to perform allocations.  */
      xpthread_barrier_wait (&barrier);

      for (int i = 0; i < count; ++i)
        {
          if (allocations[i] == NULL)
            errx (1, "allocation is NULL");
          free (allocations[i]);
        }

      /* Signal to the main thread that new allocations can be
         performed.  */
      xpthread_barrier_wait (&barrier);
    }
}

static void
make_allocations (char **allocations, int count)
{
  for (int i = 0; i < count; ++i)
    {
      size_t size = 120; /* 128 byte cache line, 8 bytes malloc overhead.  */
      allocations[i] = malloc (size);
      if (allocations[i] == NULL)
        err (1, "malloc");
      memset (allocations[i], 0xc0, size);
    }
}

int
main (int argc, char **argv)
{
  if (argc != 3)
    usage (argv);
  int thread_count = atoi (argv[1]);
  if (thread_count <= 0)
    usage (argv);
  per_thread = atoi (argv[2]);
  if (per_thread <= 0)
    usage (argv);

  check ("pthread_barrier_init",
         pthread_barrier_init (&barrier, NULL, thread_count + 1));
  pthread_attr_t attr;
  check ("pthread_attr_init", pthread_attr_init (&attr));
  check ("pthread_attr_setstacksize",
         pthread_attr_setstacksize (&attr, 256 * 1024));

  pthread_t *threads = calloc (thread_count, sizeof (*threads));
  if (threads == NULL)
    err (1, "calloc");
  char **allocations = calloc (thread_count * per_thread,
                               sizeof (*allocations));
  if (allocations == NULL)
    err (1, "calloc");
  char **next_allocations = calloc (thread_count * per_thread,
                                    sizeof (*allocations));
  if (next_allocations == NULL)
    err (1, "calloc");

  for (int i = 0; i < thread_count; ++i)
    check ("pthread_create",
           pthread_create (threads + i, &attr,
                           thread_routine, allocations + i * per_thread));
  make_allocations (allocations, thread_count * per_thread);

  while (true)
    {
      xpthread_barrier_wait (&barrier);
      /* Threads perform deallocations here.  */
      make_allocations (next_allocations, thread_count * per_thread);
      xpthread_barrier_wait (&barrier);

      memcpy (allocations, next_allocations,
              thread_count * per_thread * sizeof (*allocations));
    }
}
Carlos O'Donell Jan. 16, 2019, 3:39 p.m. | #5
On 1/16/19 7:30 AM, Florian Weimer wrote:
> * Anton Blanchard:

> 

>> Hi Florian,

>>

>>>> I see a 16% regression on ppc64le with a simple threaded malloc test

>>>> case. I guess the C11 atomics aren't as good as what we have in

>>>> glibc.  

>>>

>>> Uh-oh.  Would you please check if replacing the two

>>> atomic_load_acquire with atomic_load_relaxed restore the previous

>>> performance?

>>

>> As you suspect, doing this does restore the performance. The two lwsync

>> barrier instructions must be causing the slow down.

> 

> Okay, I'll post a patch to revert that commit.

> 

> However, the old code had this in _int_malloc (where the arena lock is

> acquired):

> 

> #define REMOVE_FB(fb, victim, pp)                       \

>   do                                                    \

>     {                                                   \

>       victim = pp;                                      \

>       if (victim == NULL)                               \

>         break;                                          \

>     }                                                   \

>   while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \

>          != victim);                                    \

> …

>             REMOVE_FB (fb, pp, victim);

> 

> And this in _int_free (without the arena lock):

> 

>       do

>         {

>           /* Check that the top of the bin is not the record we are going to

>              add (i.e., double free).  */

>           if (__builtin_expect (old == p, 0))

>             malloc_printerr ("double free or corruption (fasttop)");

>           p->fd = old2 = old;

>         }

>       while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))

>              != old2);

> 

> I really don't see what makes sure that the store of p->fd happens

> before the load of victim->fd.

> 

> It works out on POWER for some reason.  I'm attaching a test case that

> should exercise these two code paths.


This is still not a root cause analysis of the problem.

I think it is OK to revert your patch under the pressure of the upcoming
release and the fact that this is a serious performance regression.

However, there is some due diligence that needs to be done here, and I
think the patch should *right back in* as soon as master opens. We already
have *lots* of other code using C11-style atomics in glibc, and we want
to make sure those work as best we can. This issue is either one of two
things:

* non-C11 atomics are *wrong* for aarch64/power64 and the fastbins can
  eventually become corrupted or worse see unbounded growth as we
  corrupt the chunk list.

or

* non-C11 atomics are more optimal for C11 atomics and we can adjust
  the C11 atomics to be as good as the non-C11 versions with a little
  help from Arm and IBM.

We have to be able to prove the fastbin access is correct.

-- 
Cheers,
Carlos.
Florian Weimer Jan. 16, 2019, 4:04 p.m. | #6
* Carlos O'Donell:

> On 1/16/19 7:30 AM, Florian Weimer wrote:

>> * Anton Blanchard:

>> 

>>> Hi Florian,

>>>

>>>>> I see a 16% regression on ppc64le with a simple threaded malloc test

>>>>> case. I guess the C11 atomics aren't as good as what we have in

>>>>> glibc.  

>>>>

>>>> Uh-oh.  Would you please check if replacing the two

>>>> atomic_load_acquire with atomic_load_relaxed restore the previous

>>>> performance?

>>>

>>> As you suspect, doing this does restore the performance. The two lwsync

>>> barrier instructions must be causing the slow down.

>> 

>> Okay, I'll post a patch to revert that commit.

>> 

>> However, the old code had this in _int_malloc (where the arena lock is

>> acquired):

>> 

>> #define REMOVE_FB(fb, victim, pp)                       \

>>   do                                                    \

>>     {                                                   \

>>       victim = pp;                                      \

>>       if (victim == NULL)                               \

>>         break;                                          \

>>     }                                                   \

>>   while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \

>>          != victim);                                    \

>> …

>>             REMOVE_FB (fb, pp, victim);

>> 

>> And this in _int_free (without the arena lock):

>> 

>>       do

>>         {

>>           /* Check that the top of the bin is not the record we are going to

>>              add (i.e., double free).  */

>>           if (__builtin_expect (old == p, 0))

>>             malloc_printerr ("double free or corruption (fasttop)");

>>           p->fd = old2 = old;

>>         }

>>       while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))

>>              != old2);

>> 

>> I really don't see what makes sure that the store of p->fd happens

>> before the load of victim->fd.

>> 

>> It works out on POWER for some reason.  I'm attaching a test case that

>> should exercise these two code paths.

>

> This is still not a root cause analysis of the problem.


We know the root of the performance problem, the additional
synchronization, and, on Aarch64, a consequence of what was supposed to
be a minor algorithmic change.

> I think it is OK to revert your patch under the pressure of the upcoming

> release and the fact that this is a serious performance regression.

>

> However, there is some due diligence that needs to be done here, and I

> think the patch should *right back in* as soon as master opens. We already

> have *lots* of other code using C11-style atomics in glibc, and we want

> to make sure those work as best we can. This issue is either one of two

> things:

>

> * non-C11 atomics are *wrong* for aarch64/power64 and the fastbins can

>   eventually become corrupted or worse see unbounded growth as we

>   corrupt the chunk list.

>

> or

>

> * non-C11 atomics are more optimal for C11 atomics and we can adjust

>   the C11 atomics to be as good as the non-C11 versions with a little

>   help from Arm and IBM.


It's also possible there's a different algorithm that's fast to
implement with C11 atomics.

I'm not a concurrency expert, but I really don't see a way to avoid the
additional barriers with the current algorithm.  It would also be worth
checking if fastbins are even a win on these architectures.

Thanks,
Florian
Carlos O'Donell Jan. 16, 2019, 4:30 p.m. | #7
On 1/16/19 11:04 AM, Florian Weimer wrote:
> * Carlos O'Donell:

> 

>> On 1/16/19 7:30 AM, Florian Weimer wrote:

>>> * Anton Blanchard:

>>>

>>>> Hi Florian,

>>>>

>>>>>> I see a 16% regression on ppc64le with a simple threaded malloc test

>>>>>> case. I guess the C11 atomics aren't as good as what we have in

>>>>>> glibc.  

>>>>>

>>>>> Uh-oh.  Would you please check if replacing the two

>>>>> atomic_load_acquire with atomic_load_relaxed restore the previous

>>>>> performance?

>>>>

>>>> As you suspect, doing this does restore the performance. The two lwsync

>>>> barrier instructions must be causing the slow down.

>>>

>>> Okay, I'll post a patch to revert that commit.

>>>

>>> However, the old code had this in _int_malloc (where the arena lock is

>>> acquired):

>>>

>>> #define REMOVE_FB(fb, victim, pp)                       \

>>>   do                                                    \

>>>     {                                                   \

>>>       victim = pp;                                      \

>>>       if (victim == NULL)                               \

>>>         break;                                          \

>>>     }                                                   \

>>>   while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \

>>>          != victim);                                    \

>>> …

>>>             REMOVE_FB (fb, pp, victim);

>>>

>>> And this in _int_free (without the arena lock):

>>>

>>>       do

>>>         {

>>>           /* Check that the top of the bin is not the record we are going to

>>>              add (i.e., double free).  */

>>>           if (__builtin_expect (old == p, 0))

>>>             malloc_printerr ("double free or corruption (fasttop)");

>>>           p->fd = old2 = old;

>>>         }

>>>       while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))

>>>              != old2);

>>>

>>> I really don't see what makes sure that the store of p->fd happens

>>> before the load of victim->fd.

>>>

>>> It works out on POWER for some reason.  I'm attaching a test case that

>>> should exercise these two code paths.

>>

>> This is still not a root cause analysis of the problem.

> 

> We know the root of the performance problem, the additional

> synchronization, and, on Aarch64, a consequence of what was supposed to

> be a minor algorithmic change.


OK, I hadn't seen an analysis of the instruction sequences, so I wasn't
sure if this was completely understood.

This begs the question: Is the existing code correct for Aarch64 and POWER64?

I guess we don't know without serious review.

>> I think it is OK to revert your patch under the pressure of the upcoming

>> release and the fact that this is a serious performance regression.

>>

>> However, there is some due diligence that needs to be done here, and I

>> think the patch should *right back in* as soon as master opens. We already

>> have *lots* of other code using C11-style atomics in glibc, and we want

>> to make sure those work as best we can. This issue is either one of two

>> things:

>>

>> * non-C11 atomics are *wrong* for aarch64/power64 and the fastbins can

>>   eventually become corrupted or worse see unbounded growth as we

>>   corrupt the chunk list.

>>

>> or

>>

>> * non-C11 atomics are more optimal for C11 atomics and we can adjust

>>   the C11 atomics to be as good as the non-C11 versions with a little

>>   help from Arm and IBM.

> 

> It's also possible there's a different algorithm that's fast to

> implement with C11 atomics.


Or the existing C11-style atomics for Aarch64 and POWER64 are pessimistic
and we might do better.

> I'm not a concurrency expert, but I really don't see a way to avoid the

> additional barriers with the current algorithm.  It would also be worth

> checking if fastbins are even a win on these architectures.


Right, it needs a review.

At least on x86_64 when we did the tcache vs. fastbin testing the addition
of fastbin was still a win.

We'd have to run the simulator and workload on Aarch64 to see, which we
can ask Arm to do... and IBM for POWER64.

-- 
Cheers,
Carlos.
Szabolcs Nagy Jan. 16, 2019, 4:52 p.m. | #8
On 16/01/2019 16:30, Carlos O'Donell wrote:
> On 1/16/19 11:04 AM, Florian Weimer wrote:

>> * Carlos O'Donell:

>>

>>> On 1/16/19 7:30 AM, Florian Weimer wrote:

>>>> * Anton Blanchard:

>>>>

>>>>> Hi Florian,

>>>>>

>>>>>>> I see a 16% regression on ppc64le with a simple threaded malloc test

>>>>>>> case. I guess the C11 atomics aren't as good as what we have in

>>>>>>> glibc.  

>>>>>>

>>>>>> Uh-oh.  Would you please check if replacing the two

>>>>>> atomic_load_acquire with atomic_load_relaxed restore the previous

>>>>>> performance?

>>>>>

>>>>> As you suspect, doing this does restore the performance. The two lwsync

>>>>> barrier instructions must be causing the slow down.

>>>>

>>>> Okay, I'll post a patch to revert that commit.

>>>>

>>>> However, the old code had this in _int_malloc (where the arena lock is

>>>> acquired):

>>>>

>>>> #define REMOVE_FB(fb, victim, pp)                       \

>>>>   do                                                    \

>>>>     {                                                   \

>>>>       victim = pp;                                      \

>>>>       if (victim == NULL)                               \

>>>>         break;                                          \

>>>>     }                                                   \

>>>>   while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \

>>>>          != victim);                                    \

>>>> …

>>>>             REMOVE_FB (fb, pp, victim);

>>>>

>>>> And this in _int_free (without the arena lock):

>>>>

>>>>       do

>>>>         {

>>>>           /* Check that the top of the bin is not the record we are going to

>>>>              add (i.e., double free).  */

>>>>           if (__builtin_expect (old == p, 0))

>>>>             malloc_printerr ("double free or corruption (fasttop)");

>>>>           p->fd = old2 = old;

>>>>         }

>>>>       while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))

>>>>              != old2);

>>>>

>>>> I really don't see what makes sure that the store of p->fd happens

>>>> before the load of victim->fd.

>>>>

>>>> It works out on POWER for some reason.  I'm attaching a test case that

>>>> should exercise these two code paths.

>>>

>>> This is still not a root cause analysis of the problem.

>>

>> We know the root of the performance problem, the additional

>> synchronization, and, on Aarch64, a consequence of what was supposed to

>> be a minor algorithmic change.

> 

> OK, I hadn't seen an analysis of the instruction sequences, so I wasn't

> sure if this was completely understood.

> 

> This begs the question: Is the existing code correct for Aarch64 and POWER64?

> 

> I guess we don't know without serious review.

> 

>>> I think it is OK to revert your patch under the pressure of the upcoming

>>> release and the fact that this is a serious performance regression.

>>>

>>> However, there is some due diligence that needs to be done here, and I

>>> think the patch should *right back in* as soon as master opens. We already

>>> have *lots* of other code using C11-style atomics in glibc, and we want

>>> to make sure those work as best we can. This issue is either one of two

>>> things:

>>>

>>> * non-C11 atomics are *wrong* for aarch64/power64 and the fastbins can

>>>   eventually become corrupted or worse see unbounded growth as we

>>>   corrupt the chunk list.

>>>

>>> or

>>>

>>> * non-C11 atomics are more optimal for C11 atomics and we can adjust

>>>   the C11 atomics to be as good as the non-C11 versions with a little

>>>   help from Arm and IBM.

>>

>> It's also possible there's a different algorithm that's fast to

>> implement with C11 atomics.

> 

> Or the existing C11-style atomics for Aarch64 and POWER64 are pessimistic

> and we might do better.

> 

>> I'm not a concurrency expert, but I really don't see a way to avoid the

>> additional barriers with the current algorithm.  It would also be worth

>> checking if fastbins are even a win on these architectures.

> 

> Right, it needs a review.

> 

> At least on x86_64 when we did the tcache vs. fastbin testing the addition

> of fastbin was still a win.

> 

> We'd have to run the simulator and workload on Aarch64 to see, which we

> can ask Arm to do... and IBM for POWER64.

> 


we will look into the regression.

it is not yet clear what causes it.
Carlos O'Donell Jan. 16, 2019, 4:55 p.m. | #9
On 1/16/19 11:52 AM, Szabolcs Nagy wrote:
> On 16/01/2019 16:30, Carlos O'Donell wrote:

>> On 1/16/19 11:04 AM, Florian Weimer wrote:

>>> * Carlos O'Donell:

>>>

>>>> On 1/16/19 7:30 AM, Florian Weimer wrote:

>>>>> * Anton Blanchard:

>>>>>

>>>>>> Hi Florian,

>>>>>>

>>>>>>>> I see a 16% regression on ppc64le with a simple threaded malloc test

>>>>>>>> case. I guess the C11 atomics aren't as good as what we have in

>>>>>>>> glibc.  

>>>>>>>

>>>>>>> Uh-oh.  Would you please check if replacing the two

>>>>>>> atomic_load_acquire with atomic_load_relaxed restore the previous

>>>>>>> performance?

>>>>>>

>>>>>> As you suspect, doing this does restore the performance. The two lwsync

>>>>>> barrier instructions must be causing the slow down.

>>>>>

>>>>> Okay, I'll post a patch to revert that commit.

>>>>>

>>>>> However, the old code had this in _int_malloc (where the arena lock is

>>>>> acquired):

>>>>>

>>>>> #define REMOVE_FB(fb, victim, pp)                       \

>>>>>   do                                                    \

>>>>>     {                                                   \

>>>>>       victim = pp;                                      \

>>>>>       if (victim == NULL)                               \

>>>>>         break;                                          \

>>>>>     }                                                   \

>>>>>   while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \

>>>>>          != victim);                                    \

>>>>> …

>>>>>             REMOVE_FB (fb, pp, victim);

>>>>>

>>>>> And this in _int_free (without the arena lock):

>>>>>

>>>>>       do

>>>>>         {

>>>>>           /* Check that the top of the bin is not the record we are going to

>>>>>              add (i.e., double free).  */

>>>>>           if (__builtin_expect (old == p, 0))

>>>>>             malloc_printerr ("double free or corruption (fasttop)");

>>>>>           p->fd = old2 = old;

>>>>>         }

>>>>>       while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))

>>>>>              != old2);

>>>>>

>>>>> I really don't see what makes sure that the store of p->fd happens

>>>>> before the load of victim->fd.

>>>>>

>>>>> It works out on POWER for some reason.  I'm attaching a test case that

>>>>> should exercise these two code paths.

>>>>

>>>> This is still not a root cause analysis of the problem.

>>>

>>> We know the root of the performance problem, the additional

>>> synchronization, and, on Aarch64, a consequence of what was supposed to

>>> be a minor algorithmic change.

>>

>> OK, I hadn't seen an analysis of the instruction sequences, so I wasn't

>> sure if this was completely understood.

>>

>> This begs the question: Is the existing code correct for Aarch64 and POWER64?

>>

>> I guess we don't know without serious review.

>>

>>>> I think it is OK to revert your patch under the pressure of the upcoming

>>>> release and the fact that this is a serious performance regression.

>>>>

>>>> However, there is some due diligence that needs to be done here, and I

>>>> think the patch should *right back in* as soon as master opens. We already

>>>> have *lots* of other code using C11-style atomics in glibc, and we want

>>>> to make sure those work as best we can. This issue is either one of two

>>>> things:

>>>>

>>>> * non-C11 atomics are *wrong* for aarch64/power64 and the fastbins can

>>>>   eventually become corrupted or worse see unbounded growth as we

>>>>   corrupt the chunk list.

>>>>

>>>> or

>>>>

>>>> * non-C11 atomics are more optimal for C11 atomics and we can adjust

>>>>   the C11 atomics to be as good as the non-C11 versions with a little

>>>>   help from Arm and IBM.

>>>

>>> It's also possible there's a different algorithm that's fast to

>>> implement with C11 atomics.

>>

>> Or the existing C11-style atomics for Aarch64 and POWER64 are pessimistic

>> and we might do better.

>>

>>> I'm not a concurrency expert, but I really don't see a way to avoid the

>>> additional barriers with the current algorithm.  It would also be worth

>>> checking if fastbins are even a win on these architectures.

>>

>> Right, it needs a review.

>>

>> At least on x86_64 when we did the tcache vs. fastbin testing the addition

>> of fastbin was still a win.

>>

>> We'd have to run the simulator and workload on Aarch64 to see, which we

>> can ask Arm to do... and IBM for POWER64.

>>

> 

> we will look into the regression.

> 

> it is not yet clear what causes it.

> 


Awesome. Thank you for that. To be clear my worry is that we're using 
C11-style atomics all over the place in glibc now, and if they are not 
correct or really slow we should understand why.

This particular change is OK for for me, I fully understand you want
the performance back to level it was at before.

-- 
Cheers,
Carlos.

Patch

diff --git a/malloc/malloc.c b/malloc/malloc.c
index bfc605aa3e..7c2186c307 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1316,6 +1316,77 @@  nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 #define set_foot(p, s)       (((mchunkptr) ((char *) (p) + (s)))->mchunk_prev_size = (s))
 
 
+/* Add an item to the atomic fastbin list at *ROOT.  Returns the old
+   value at *ROOT.  Note that properties of the old chunk are only
+   stable if the caller has acquired the arena lock.  With out the
+   lock, it can be deallocated at any time.  */
+static inline struct malloc_chunk *
+fastbin_push_entry (struct malloc_chunk **root, struct malloc_chunk *e)
+{
+  struct malloc_chunk *head;
+  if (SINGLE_THREAD_P)
+    {
+      /* Check that the top of the bin is not the record we are going
+	 to add (i.e., double free).  */
+      head = *root;
+      if (head == e)
+	malloc_printerr ("double free or corruption (fasttop)");
+      e->fd = head;
+      *root = e;
+    }
+  else
+    do
+      {
+	/* Synchronize with the release release MO CAS below.  We do
+	   not need synchronization locally, but fastbin_pop_entry and
+	   (especially) malloc_consolidate read the entire list after
+	   synchronizing on the head, so we need to make sure that the
+	   writes to the next (fd) pointers have happened.  */
+	head = atomic_load_acquire (root);
+	/* Check that the top of the bin is not the record we are
+	   going to add (i.e., double free).  */
+	if (head == e)
+	  malloc_printerr ("double free or corruption (fasttop)");
+	e->fd = head;
+      }
+  /* Synchronizes with the acquire MO CAS in  */
+    while (!atomic_compare_exchange_weak_release (root, &head, e));
+  return head;
+}
+
+/* Remove an item from the atomic fastbin list at *ROOT.  The caller
+   must have acquired the arena lock.  */
+static inline struct malloc_chunk *
+fastbin_pop_entry (struct malloc_chunk **root)
+{
+  struct malloc_chunk *head;
+  if (SINGLE_THREAD_P)
+    {
+      head = *root;
+      if (head != NULL)
+	*root = head->fd;
+    }
+  else
+    {
+      /* Synchromizes with the release MO store in fastbin_push_entry.
+	 Synchronization is needed because we read the next list
+	 pointer.  */
+      head = atomic_load_acquire (root);
+      struct malloc_chunk *tail;
+      do
+	if (head == NULL)
+	  return NULL;
+	else
+	  tail = head->fd;
+      /* Synchronizes with the release MO store in fastbin_push_entry.
+	 We do not have an ABA issue here because the caller has
+	 acquired the arena lock, which ensures that there is only one
+	 thread which removes elements from this list.  */
+      while (!atomic_compare_exchange_weak_acquire (root, &head, tail));
+    }
+  return head;
+}
+
 #pragma GCC poison mchunk_size
 #pragma GCC poison mchunk_prev_size
 
@@ -3559,63 +3630,36 @@  _int_malloc (mstate av, size_t bytes)
      can try it without checking, which saves some time on this fast path.
    */
 
-#define REMOVE_FB(fb, victim, pp)			\
-  do							\
-    {							\
-      victim = pp;					\
-      if (victim == NULL)				\
-	break;						\
-    }							\
-  while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
-	 != victim);					\
-
   if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
     {
       idx = fastbin_index (nb);
       mfastbinptr *fb = &fastbin (av, idx);
-      mchunkptr pp;
-      victim = *fb;
-
+      victim = fastbin_pop_entry (fb);
       if (victim != NULL)
 	{
-	  if (SINGLE_THREAD_P)
-	    *fb = victim->fd;
-	  else
-	    REMOVE_FB (fb, pp, victim);
-	  if (__glibc_likely (victim != NULL))
-	    {
-	      size_t victim_idx = fastbin_index (chunksize (victim));
-	      if (__builtin_expect (victim_idx != idx, 0))
-		malloc_printerr ("malloc(): memory corruption (fast)");
-	      check_remalloced_chunk (av, victim, nb);
+	  size_t victim_idx = fastbin_index (chunksize (victim));
+	  if (victim_idx != idx)
+	    malloc_printerr ("malloc(): memory corruption (fast)");
+	  check_remalloced_chunk (av, victim, nb);
 #if USE_TCACHE
-	      /* While we're here, if we see other chunks of the same size,
-		 stash them in the tcache.  */
-	      size_t tc_idx = csize2tidx (nb);
-	      if (tcache && tc_idx < mp_.tcache_bins)
+	  /* While we're here, if we see other chunks of the same size,
+	     stash them in the tcache.  */
+	  size_t tc_idx = csize2tidx (nb);
+	  if (tcache && tc_idx < mp_.tcache_bins)
+	    {
+	      /* While bin not empty and tcache not full, copy chunks.  */
+	      while (tcache->counts[tc_idx] < mp_.tcache_count)
 		{
-		  mchunkptr tc_victim;
-
-		  /* While bin not empty and tcache not full, copy chunks.  */
-		  while (tcache->counts[tc_idx] < mp_.tcache_count
-			 && (tc_victim = *fb) != NULL)
-		    {
-		      if (SINGLE_THREAD_P)
-			*fb = tc_victim->fd;
-		      else
-			{
-			  REMOVE_FB (fb, pp, tc_victim);
-			  if (__glibc_unlikely (tc_victim == NULL))
-			    break;
-			}
-		      tcache_put (tc_victim, tc_idx);
-		    }
+		  mchunkptr tc_victim = fastbin_pop_entry (fb);
+		  if (tc_victim == NULL)
+		    break;
+		  tcache_put (tc_victim, tc_idx);
 		}
-#endif
-	      void *p = chunk2mem (victim);
-	      alloc_perturb (p, bytes);
-	      return p;
 	    }
+#endif
+	  void *p = chunk2mem (victim);
+	  alloc_perturb (p, bytes);
+	  return p;
 	}
     }
 
@@ -4227,28 +4271,7 @@  _int_free (mstate av, mchunkptr p, int have_lock)
     fb = &fastbin (av, idx);
 
     /* Atomically link P to its fastbin: P->FD = *FB; *FB = P;  */
-    mchunkptr old = *fb, old2;
-
-    if (SINGLE_THREAD_P)
-      {
-	/* Check that the top of the bin is not the record we are going to
-	   add (i.e., double free).  */
-	if (__builtin_expect (old == p, 0))
-	  malloc_printerr ("double free or corruption (fasttop)");
-	p->fd = old;
-	*fb = p;
-      }
-    else
-      do
-	{
-	  /* Check that the top of the bin is not the record we are going to
-	     add (i.e., double free).  */
-	  if (__builtin_expect (old == p, 0))
-	    malloc_printerr ("double free or corruption (fasttop)");
-	  p->fd = old2 = old;
-	}
-      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
-	     != old2);
+    mchunkptr old = fastbin_push_entry (fb, p);
 
     /* Check that size of fastbin chunk at the top is the same as
        size of the chunk that we are adding.  We can dereference OLD
@@ -4439,7 +4462,9 @@  static void malloc_consolidate(mstate av)
   maxfb = &fastbin (av, NFASTBINS - 1);
   fb = &fastbin (av, 0);
   do {
-    p = atomic_exchange_acq (fb, NULL);
+    /* Synchronizes with the release MO store in
+       fastbin_push_entry.  */
+    p = atomic_exchange_acquire (fb, NULL);
     if (p != 0) {
       do {
 	{