| /* |
| * Copyright (c) 2008-2013 Apple Inc. All rights reserved. |
| * |
| * @APPLE_APACHE_LICENSE_HEADER_START@ |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| * @APPLE_APACHE_LICENSE_HEADER_END@ |
| */ |
| |
| #include "internal.h" |
| |
| #undef dispatch_once |
| #undef dispatch_once_f |
| |
| |
| typedef struct _dispatch_once_waiter_s { |
| volatile struct _dispatch_once_waiter_s *volatile dow_next; |
| dispatch_thread_event_s dow_event; |
| mach_port_t dow_thread; |
| } *_dispatch_once_waiter_t; |
| |
| #define DISPATCH_ONCE_DONE ((_dispatch_once_waiter_t)~0l) |
| |
#ifdef __BLOCKS__
/*
 * Block flavour of the once API. Forwards to dispatch_once_f(), using the
 * block itself as the context argument and the function obtained from
 * _dispatch_Block_invoke(block) as the callback.
 *
 * val:   once predicate, passed through unchanged.
 * block: block to run once.
 */
void
dispatch_once(dispatch_once_t *val, dispatch_block_t block)
{
	dispatch_function_t invoke = _dispatch_Block_invoke(block);

	dispatch_once_f(val, block, invoke);
}
#endif
| |
/*
 * Function-pointer form of the once API: func(ctxt) is invoked, through
 * _dispatch_client_callout(), by the single caller that manages to claim the
 * predicate *val. Every other caller either observes that the predicate is
 * already marked done or blocks until the claiming thread wakes it.
 *
 * val:  once predicate. On the non-gate path it is reinterpreted as a pointer
 *       slot holding NULL (not yet claimed), the most recently enqueued
 *       waiter record, or DISPATCH_ONCE_DONE.
 * ctxt: opaque argument handed to func.
 * func: initializer to run.
 *
 * NOTE(review): the statement order on the non-gate path is load-bearing
 * (see the long comment below); do not reorder without re-deriving it.
 */
| DISPATCH_NOINLINE |
| void |
| dispatch_once_f(dispatch_once_t *val, void *ctxt, dispatch_function_t func) |
| { |
| #if DISPATCH_GATE_USE_FOR_DISPATCH_ONCE |
// Gate-based variant: the predicate is reinterpreted as a once-gate. Going by
// the helper names, the caller that enters the gate runs the initializer and
// then broadcasts, and any other caller waits on the gate -- confirm against
// the gate helpers' definitions.
| dispatch_once_gate_t l = (dispatch_once_gate_t)val; |
| |
| if (_dispatch_once_gate_tryenter(l)) { |
| _dispatch_client_callout(ctxt, func); |
| _dispatch_once_gate_broadcast(l); |
| } else { |
| _dispatch_once_gate_wait(l); |
| } |
| #else |
// Waiter-chain variant: each caller owns a record (dow) in its own stack
// frame and publishes the record's address through the predicate.
| _dispatch_once_waiter_t volatile *vval = (_dispatch_once_waiter_t*)val; |
| struct _dispatch_once_waiter_s dow = { }; |
| _dispatch_once_waiter_t tail = &dow, next, tmp; |
| dispatch_thread_event_t event; |
| |
// Try to claim the predicate by replacing NULL with our own record. The
// caller for which this succeeds runs the initializer.
| if (os_atomic_cmpxchg(vval, NULL, tail, acquire)) { |
// Record our thread id so that waiters can hand it to
// _dispatch_thread_override_start(). This store happens after the record is
// already visible, so a waiter that enqueues early may still read 0; waiters
// check for that before using the value.
| dow.dow_thread = _dispatch_tid_self(); |
| _dispatch_client_callout(ctxt, func); |
| |
| // The next barrier must be long and strong. |
| // |
| // The scenario: SMP systems with weakly ordered memory models |
| // and aggressive out-of-order instruction execution. |
| // |
| // The problem: |
| // |
| // The dispatch_once*() wrapper macro causes the callee's |
| // instruction stream to look like this (pseudo-RISC): |
| // |
| // load r5, pred-addr |
| // cmpi r5, -1 |
| // beq 1f |
| // call dispatch_once*() |
| // 1f: |
| // load r6, data-addr |
| // |
| // May be re-ordered like so: |
| // |
| // load r6, data-addr |
| // load r5, pred-addr |
| // cmpi r5, -1 |
| // beq 1f |
| // call dispatch_once*() |
| // 1f: |
| // |
| // Normally, a barrier on the read side is used to workaround |
| // the weakly ordered memory model. But barriers are expensive |
| // and we only need to synchronize once! After func(ctxt) |
| // completes, the predicate will be marked as "done" and the |
| // branch predictor will correctly skip the call to |
| // dispatch_once*(). |
| // |
| // A far faster alternative solution: Defeat the speculative |
| // read-ahead of peer CPUs. |
| // |
| // Modern architectures will throw away speculative results |
| // once a branch mis-prediction occurs. Therefore, if we can |
| // ensure that the predicate is not marked as being complete |
| // until long after the last store by func(ctxt), then we have |
| // defeated the read-ahead of peer CPUs. |
| // |
| // In other words, the last "store" by func(ctxt) must complete |
| // and then N cycles must elapse before ~0l is stored to *val. |
| // The value of N is whatever is sufficient to defeat the |
| // read-ahead mechanism of peer CPUs. |
| // |
| // On some CPUs, the most fully synchronizing instruction might |
| // need to be issued. |
| |
| os_atomic_maximally_synchronizing_barrier(); |
| // above assumed to contain release barrier |
// Mark the predicate done and, in the same atomic exchange, take whatever
// record was installed: either our own (nobody waited) or the most recently
// enqueued waiter.
| next = os_atomic_xchg(vval, DISPATCH_ONCE_DONE, relaxed); |
// Walk the chain from the newest waiter back to our own record, waking each
// one. A waiter installs its record before it stores dow_next, hence the spin
// until dow_next is non-NULL. Everything needed from a record (dow_next and
// the event address) is fetched before the signal because the record lives in
// the waiter's stack frame, and the waiter proceeds to return once woken.
| while (next != tail) { |
| _dispatch_wait_until(tmp = (_dispatch_once_waiter_t)next->dow_next); |
| event = &next->dow_event; |
| next = tmp; |
| _dispatch_thread_event_signal(event); |
| } |
| } else { |
// Somebody else claimed the predicate first: enqueue ourselves and wait,
// unless initialization has already completed.
| _dispatch_thread_event_init(&dow.dow_event); |
| next = *vval; |
| for (;;) { |
| if (next == DISPATCH_ONCE_DONE) { |
// Initialization already finished; nothing to wait for.
| break; |
| } |
// Attempt to install our record in place of the value last observed. `next`
// is also passed as the out-parameter, presumably so a failed attempt
// refreshes it with the current value before the loop retries -- confirm
// against the os_atomic_cmpxchgvw definition.
| if (os_atomic_cmpxchgvw(vval, next, tail, &next, release)) { |
// Copy the thread id from the record we displaced, then link to that record.
// The initializing thread spins on this dow_next store when it walks the
// chain.
| dow.dow_thread = next->dow_thread; |
| dow.dow_next = next; |
// Only start an override when a thread id was actually available; the
// matching override_end below is guarded by the same condition.
| if (dow.dow_thread) { |
| pthread_priority_t pp = _dispatch_get_priority(); |
| _dispatch_thread_override_start(dow.dow_thread, pp, val); |
| } |
| _dispatch_thread_event_wait(&dow.dow_event); |
| if (dow.dow_thread) { |
| _dispatch_thread_override_end(dow.dow_thread, val); |
| } |
| break; |
| } |
| } |
| _dispatch_thread_event_destroy(&dow.dow_event); |
| } |
| #endif |
| } |