diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c
index dd9d99a6f4..512fc1248c 100644
--- a/nptl/pthread_cond_common.c
+++ b/nptl/pthread_cond_common.c
@@ -201,7 +201,6 @@ static bool __attribute__ ((unused))
 __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
                                  unsigned int *g1index, int private)
 {
-  const unsigned int maxspin = 0;
   unsigned int g1 = *g1index;
 
   /* If there is no waiter in G2, we don't do anything.  The expression may
@@ -222,84 +221,46 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
      * New waiters arriving concurrently with the group switching will all go
        into G2 until we atomically make the switch.  Waiters existing in G2
        are not affected.
-     * Waiters in G1 will be closed out immediately by setting a flag in
-       __g_signals, which will prevent waiters from blocking using a futex on
-       __g_signals and also notifies them that the group is closed.  As a
-       result, they will eventually remove their group reference, allowing us
-       to close switch group roles.  */
-
-  /* First, set the closed flag on __g_signals.  This tells waiters that are
-     about to wait that they shouldn't do that anymore.  This basically
-     serves as an advance notification of the upcoming change to __g1_start;
-     waiters interpret it as if __g1_start was larger than their waiter
-     sequence position.  This allows us to change __g1_start after waiting
-     for all existing waiters with group references to leave, which in turn
-     makes recovery after stealing a signal simpler because it then can be
-     skipped if __g1_start indicates that the group is closed (otherwise,
-     we would have to recover always because waiters don't know how big their
-     groups are).  Relaxed MO is fine.  */
-  atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1);
-
-  /* Wait until there are no group references anymore.  The fetch-or operation
-     injects us into the modification order of __g_refs; release MO ensures
-     that waiters incrementing __g_refs after our fetch-or see the previous
-     changes to __g_signals and to __g1_start that had to happen before we can
-     switch this G1 and alias with an older group (we have two groups, so
-     aliasing requires switching group roles twice).  Note that nobody else
-     can have set the wake-request flag, so we do not have to act upon it.
-
-     Also note that it is harmless if older waiters or waiters from this G1
-     get a group reference after we have quiesced the group because it will
-     remain closed for them either because of the closed flag in __g_signals
-     or the later update to __g1_start.  New waiters will never arrive here
-     but instead continue to go into the still current G2.  */
-  unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0);
-  while ((r >> 1) > 0)
-    {
-      for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--)
-        {
-          /* TODO Back off.  */
-          r = atomic_load_relaxed (cond->__data.__g_refs + g1);
-        }
-
-      if ((r >> 1) > 0)
-        {
-          /* There is still a waiter after spinning.  Set the wake-request
-             flag and block.  Relaxed MO is fine because this is just about
-             this futex word.
-
-             Update r to include the set wake-request flag so that the upcoming
-             futex_wait only blocks if the flag is still set (otherwise, we'd
-             violate the basic client-side futex protocol).  */
-          r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1) | 1;
-
-          if ((r >> 1) > 0)
-            futex_wait_simple (cond->__data.__g_refs + g1, r, private);
-          /* Reload here so we eventually see the most recent value even if we
-             do not spin.  */
-          r = atomic_load_relaxed (cond->__data.__g_refs + g1);
-        }
-    }
-
-  /* Acquire MO so that we synchronize with the release operation that waiters
-     use to decrement __g_refs and thus happen after the waiters we waited
-     for.  */
-  atomic_thread_fence_acquire ();
+     * Waiters in G1 will be closed out immediately by the advancing of
+       __g_signals to the next "lowseq" (low 31 bits of the new g1_start),
+       which will prevent waiters from blocking using a futex on
+       __g_signals since it provides enough signals for all possible
+       remaining waiters.  As a result, they can each consume a signal
+       and they will eventually remove their group reference.  */
 
   /* Update __g1_start, which finishes closing this group.  The value we add
      will never be negative because old_orig_size can only be zero when we
      switch groups the first time after a condvar was initialized, in which
-     case G1 will be at index 1 and we will add a value of 1.  See above for
-     why this takes place after waiting for quiescence of the group.
+     case G1 will be at index 1 and we will add a value of 1.
      Relaxed MO is fine because the change comes with no additional
      constraints that others would have to observe.  */
   __condvar_add_g1_start_relaxed (cond,
       (old_orig_size << 1) + (g1 == 1 ? 1 : - 1));
 
-  /* Now reopen the group, thus enabling waiters to again block using the
-     futex controlled by __g_signals.  Release MO so that observers that see
-     no signals (and thus can block) also see the write __g1_start and thus
-     that this is now a new group (see __pthread_cond_wait_common for the
-     matching acquire MO loads).  */
-  atomic_store_release (cond->__data.__g_signals + g1, 0);
+  unsigned int lowseq = ((old_g1_start + old_orig_size) << 1) & ~1U;
+
+  /* If any waiters still hold group references (and thus could be blocked),
+     then wake them all up now and prevent any running ones from blocking.
+     This is effectively a catch-all for any possible current or future
+     bugs that can allow the group size to reach 0 before all G1 waiters
+     have been awakened or at least given signals to consume, or any
+     other case that can leave blocked (or about to block) older waiters.  */
+  if ((atomic_fetch_or_release (cond->__data.__g_refs + g1, 0) >> 1) > 0)
+    {
+      /* First advance signals to the end of the group (i.e. enough signals
+         for the entire G1 group) to ensure that waiters which have not
+         yet blocked in the futex will not block.
+         Note that in the vast majority of cases, this should never
+         actually be necessary, since __g_signals will have enough
+         signals for the remaining g_refs waiters.  As an optimization,
+         we could check this first before proceeding, although that
+         could still leave the potential for futex lost-wakeup bugs
+         if the signal count was non-zero but the futex wakeup
+         was somehow lost.  */
+      atomic_store_release (cond->__data.__g_signals + g1, lowseq);
+
+      futex_wake (cond->__data.__g_signals + g1, INT_MAX, private);
+    }
 
   /* At this point, the old G1 is now a valid new G2 (but not in use yet).
      No old waiter can neither grab a signal nor acquire a reference without
@@ -311,6 +272,10 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
   g1 ^= 1;
   *g1index ^= 1;
 
+  /* Now advance the new G1 g_signals to the new lowseq, giving it
+     an effective signal count of 0 to start.  */
+  atomic_store_release (cond->__data.__g_signals + g1, lowseq);
+
   /* These values are just observed by signalers, and thus protected by the
      lock.  */
   unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c
index 78567f039b..7ac43db743 100644
--- a/nptl/pthread_cond_wait.c
+++ b/nptl/pthread_cond_wait.c
@@ -238,9 +238,7 @@ __condvar_cleanup_waiting (void *arg)
    signaled), and a reference count.
 
    The group reference count is used to maintain the number of waiters that
-   are using the group's futex.  Before a group can change its role, the
-   reference count must show that no waiters are using the futex anymore; this
-   prevents ABA issues on the futex word.
+   are using the group's futex.
 
    To represent which intervals in the waiter sequence the groups cover (and
    thus also which group slot contains G1 or G2), we use a 64b counter to
@@ -300,11 +298,12 @@ __condvar_cleanup_waiting (void *arg)
        last reference.
      * Reference count used by waiters concurrently with signalers that have
        acquired the condvar-internal lock.
-   __g_signals: The number of signals that can still be consumed.
+   __g_signals: The number of signals that can still be consumed, relative to
+       the current g1_start.  (i.e., bits 31 to 1 of __g_signals are bits
+       31 to 1 of g1_start with the signal count added.)
      * Used as a futex word by waiters.  Used concurrently by waiters and
       signalers.
-     * LSB is true iff this group has been completely signaled (i.e., it is
-       closed).
+     * LSB is currently reserved and 0.
    __g_size: Waiters remaining in this group (i.e., which have not been
        signaled yet.
      * Accessed by signalers and waiters that cancel waiting (both do so only
@@ -328,18 +327,6 @@ __condvar_cleanup_waiting (void *arg)
    sufficient because if a waiter can see a sufficiently large value, it could
    have also consume a signal in the waiters group.
 
-   Waiters try to grab a signal from __g_signals without holding a reference
-   count, which can lead to stealing a signal from a more recent group after
-   their own group was already closed.  They cannot always detect whether they
-   in fact did because they do not know when they stole, but they can
-   conservatively add a signal back to the group they stole from; if they
-   did so unnecessarily, all that happens is a spurious wake-up.  To make this
-   even less likely, __g1_start contains the index of the current g2 too,
-   which allows waiters to check if there aliasing on the group slots; if
-   there wasn't, they didn't steal from the current G1, which means that the
-   G1 they stole from must have been already closed and they do not need to
-   fix anything.
-
    It is essential that the last field in pthread_cond_t is __g_signals[1]:
    The previous condvar used a pointer-sized field in pthread_cond_t, so a
    PTHREAD_COND_INITIALIZER from that condvar implementation might only
@@ -435,6 +422,9 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
     {
       while (1)
         {
+          uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
+          unsigned int lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+
           /* Spin-wait first.
              Note that spinning first without checking whether a timeout
              passed might lead to what looks like a spurious wake-up even
@@ -446,35 +436,45 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
              having to compare against the current time seems to be the right
              choice from a performance perspective for most use cases.  */
           unsigned int spin = maxspin;
-          while (signals == 0 && spin > 0)
+          while (spin > 0 && ((int)(signals - lowseq) < 2))
             {
               /* Check that we are not spinning on a group that's already
                  closed.  */
-              if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
-                goto done;
+              if (seq < (g1_start >> 1))
+                break;
 
               /* TODO Back off.  */
 
               /* Reload signals.  See above for MO.  */
               signals = atomic_load_acquire (cond->__data.__g_signals + g);
+              g1_start = __condvar_load_g1_start_relaxed (cond);
+              lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
               spin--;
             }
 
-          /* If our group will be closed as indicated by the flag on signals,
-             don't bother grabbing a signal.  */
-          if (signals & 1)
-            goto done;
+          if (seq < (g1_start >> 1))
+            {
+              /* If the group is closed already, then this waiter originally
+                 had enough extra signals to consume, up until the time its
+                 group was closed.  */
+              goto done;
+            }
 
-          /* If there is an available signal, don't block.  */
-          if (signals != 0)
+          /* If there is an available signal, don't block.
+             If __g1_start has advanced at all, then we must be in G1
+             by now, perhaps in the process of switching back to an older
+             G2, but in either case we're allowed to consume the available
+             signal and should not block anymore.  */
+          if ((int)(signals - lowseq) >= 2)
             break;
 
           /* No signals available after spinning, so prepare to block.
              We first acquire a group reference and use acquire MO for that so
             that we synchronize with the dummy read-modify-write in
             __condvar_quiesce_and_switch_g1 if we read from that.  In turn,
-             in this case this will make us see the closed flag on __g_signals
-             that designates a concurrent attempt to reuse the group's slot.
+             in this case this will make us see the advancement of __g_signals
+             to the upcoming new g1_start that occurs with a concurrent
+             attempt to reuse the group's slot.
              We use acquire MO for the __g_signals check to make the
             __g1_start check work (see spinning above).
              Note that the group reference acquisition will not mask the
@@ -482,15 +482,24 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
              an atomic read-modify-write operation and thus extend the release
              sequence.  */
           atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
-          if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
-              || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
+          signals = atomic_load_acquire (cond->__data.__g_signals + g);
+          g1_start = __condvar_load_g1_start_relaxed (cond);
+          lowseq = (g1_start & 1) == g ? signals : g1_start & ~1U;
+
+          if (seq < (g1_start >> 1))
             {
-              /* Our group is closed.  Wake up any signalers that might be
-                 waiting.  */
+              /* The group is closed already, so don't block.  */
               __condvar_dec_grefs (cond, g, private);
               goto done;
             }
 
+          if ((int)(signals - lowseq) >= 2)
+            {
+              /* A signal showed up or G1/G2 switched after we grabbed the
+                 refcount.  */
+              __condvar_dec_grefs (cond, g, private);
+              break;
+            }
+
+          /* Now block.  */
           struct _pthread_cleanup_buffer buffer;
           struct _condvar_cleanup_buffer cbuffer;
@@ -501,7 +510,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
           __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);
 
           err = __futex_abstimed_wait_cancelable64 (
-            cond->__data.__g_signals + g, 0, clockid, abstime, private);
+            cond->__data.__g_signals + g, signals, clockid, abstime, private);
 
           __pthread_cleanup_pop (&buffer, 0);
 
@@ -524,6 +533,8 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
           signals = atomic_load_acquire (cond->__data.__g_signals + g);
         }
 
+      if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
+        goto done;
     }
 
   /* Try to grab a signal.  Use acquire MO so that we see an up-to-date value
      of __g1_start below (see spinning above for a similar case).  In
@@ -532,69 +543,6 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
   while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
                                                 &signals, signals - 2));
 
-  /* We consumed a signal but we could have consumed from a more recent group
-     that aliased with ours due to being in the same group slot.  If this
-     might be the case our group must be closed as visible through
-     __g1_start.  */
-  uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
-  if (seq < (g1_start >> 1))
-    {
-      /* We potentially stole a signal from a more recent group but we do not
-         know which group we really consumed from.
-         We do not care about groups older than current G1 because they are
-         closed; we could have stolen from these, but then we just add a
-         spurious wake-up for the current groups.
-         We will never steal a signal from current G2 that was really intended
-         for G2 because G2 never receives signals (until it becomes G1).  We
-         could have stolen a signal from G2 that was conservatively added by a
-         previous waiter that also thought it stole a signal -- but given that
-         that signal was added unnecessarily, it's not a problem if we steal
-         it.
-         Thus, the remaining case is that we could have stolen from the current
-         G1, where "current" means the __g1_start value we observed.  However,
-         if the current G1 does not have the same slot index as we do, we did
-         not steal from it and do not need to undo that.  This is the reason
-         for putting a bit with G2's index into __g1_start as well.  */
-      if (((g1_start & 1) ^ 1) == g)
-        {
-          /* We have to conservatively undo our potential mistake of stealing
-             a signal.  We can stop trying to do that when the current G1
-             changes because other spinning waiters will notice this too and
-             __condvar_quiesce_and_switch_g1 has checked that there are no
-             futex waiters anymore before switching G1.
-             Relaxed MO is fine for the __g1_start load because we need to
-             merely be able to observe this fact and not have to observe
-             something else as well.
-             ??? Would it help to spin for a little while to see whether the
-             current G1 gets closed?  This might be worthwhile if the group is
-             small or close to being closed.  */
-          unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
-          while (__condvar_load_g1_start_relaxed (cond) == g1_start)
-            {
-              /* Try to add a signal.  We don't need to acquire the lock
-                 because at worst we can cause a spurious wake-up.  If the
-                 group is in the process of being closed (LSB is true), this
-                 has an effect similar to us adding a signal.  */
-              if (((s & 1) != 0)
-                  || atomic_compare_exchange_weak_relaxed
-                       (cond->__data.__g_signals + g, &s, s + 2))
-                {
-                  /* If we added a signal, we also need to add a wake-up on
-                     the futex.  We also need to do that if we skipped adding
-                     a signal because the group is being closed because
-                     while __condvar_quiesce_and_switch_g1 could have closed
-                     the group, it might still be waiting for futex waiters to
-                     leave (and one of those waiters might be the one we stole
-                     the signal from, which cause it to block using the
-                     futex).  */
-                  futex_wake (cond->__data.__g_signals + g, 1, private);
-                  break;
-                }
-              /* TODO Back off.  */
-            }
-        }
-    }
-
  done:
 
   /* Confirm that we have been woken.  We do that before acquiring the mutex
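
---

A reviewer's aside: the wraparound arithmetic behind the new __g_signals
encoding may be easier to see in isolation.  The following standalone sketch
is not part of the patch; available_signals is a hypothetical helper, and
only the lowseq/signal-count expressions mirror the code above.

/* Standalone sketch (not glibc code): the patched __g_signals keeps bits
   31..1 equal to bits 31..1 of g1_start plus the signal count; the LSB is
   reserved and 0.  "lowseq" is the zero-signals baseline for the group.  */
#include <stdio.h>

/* Hypothetical helper: number of consumable signals, given the futex word
   and the group's lowseq baseline.  The unsigned subtraction followed by a
   signed cast keeps the comparison correct across 32-bit wraparound, which
   is why the patch tests (int)(signals - lowseq) rather than comparing
   signals against an absolute count.  */
static int
available_signals (unsigned int signals, unsigned int lowseq)
{
  return (int) (signals - lowseq) >> 1;
}

int
main (void)
{
  unsigned int lowseq = 0xfffffffeU;  /* g1_start low bits, LSB clear.  */
  unsigned int signals = lowseq;      /* At the baseline: no signals.  */
  printf ("%d\n", available_signals (signals, lowseq));  /* Prints 0.  */

  /* Two signals delivered; the futex word wraps past zero, but the signed
     difference still yields the right count.  */
  signals += 2 * 2;
  printf ("%d\n", available_signals (signals, lowseq));  /* Prints 2.  */

  /* The waiter-side test from the patch: block only while fewer than one
     full signal (difference < 2) is available.  */
  puts ((int) (signals - lowseq) >= 2 ? "consume" : "block");
  return 0;
}

Under this encoding, "no signals" is represented by the word sitting exactly
at the baseline rather than at zero, which is what lets the group-switch code
close out G1 by storing lowseq instead of spinning on __g_refs.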