/*
* kmp_runtime.c -- KPTS runtime support library
* $Revision: 42839 $
* $Date: 2013-11-24 13:01:00 -0600 (Sun, 24 Nov 2013) $
*/
//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
#include "kmp.h"
#include "kmp_atomic.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_environment.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_settings.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_error.h"
/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0
#define KMP_USE_POOLED_ALLOC 0
#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC
#if KMP_MIC && USE_NGO_STORES
#define load_icvs(src) __m512d Vt_icvs = _mm512_load_pd((void *)(src))
#define store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt_icvs)
#define sync_icvs() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define load_icvs(src) ((void)0)
#define store_icvs(dst, src) copy_icvs((dst), (src))
#define sync_icvs() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
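/*
 * Note: with KMP_MIC && USE_NGO_STORES, store_icvs() above issues a
 * non-globally-ordered streaming store, so callers emit sync_icvs()
 * (a locked add acting as a full memory fence) before releasing other
 * threads, to make sure the copied ICVs are visible to them.
 */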
#if KMP_OS_WINDOWS
#include <process.h>
#endif
#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */
char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
#if OMP_40_ENABLED
"4.0 (201307)";
#elif OMP_30_ENABLED
"3.1 (201107)";
#else
"2.5 (200505)";
#endif
#ifdef KMP_DEBUG
char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
char const __kmp_version_perf_v19[] = KMP_VERSION_PREFIX "perf v19: "
#if KMP_PERF_V19 == KMP_ON
"on";
#elif KMP_PERF_V19 == KMP_OFF
"off";
#else
#error "Must specify KMP_PERF_V19 option"
#endif
char const __kmp_version_perf_v106[] = KMP_VERSION_PREFIX "perf v106: "
#if KMP_PERF_V106 == KMP_ON
"on";
#elif KMP_PERF_V106 == KMP_OFF
"off";
#else
#error "Must specify KMP_PERF_V106 option"
#endif
#endif /* KMP_DEBUG */
#define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
kmp_info_t __kmp_monitor;
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* Forward declarations */
void __kmp_cleanup( void );
static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
static void __kmp_initialize_team(
kmp_team_t * team,
int new_nproc,
#if OMP_30_ENABLED
kmp_internal_control_t * new_icvs,
ident_t * loc
#else
int new_set_nproc, int new_set_dynamic, int new_set_nested,
int new_set_blocktime, int new_bt_intervals, int new_bt_set
#endif // OMP_30_ENABLED
);
static void __kmp_partition_places( kmp_team_t *team );
static void __kmp_do_serial_initialize( void );
#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
#endif
static int __kmp_expand_threads(int nWish, int nNeed);
static int __kmp_unregister_root_other_thread( int gtid );
static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* Calculate the identifier of the current thread: a fast (and somewhat
   portable) way to get a unique identifier for the executing thread.
   Returns KMP_GTID_DNE if we haven't been assigned a gtid yet. */
int
__kmp_get_global_thread_id( )
{
int i;
kmp_info_t **other_threads;
size_t stack_data;
char *stack_addr;
size_t stack_size;
char *stack_base;
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
__kmp_nth, __kmp_all_nth ));
/* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
parallel region, this was made to return KMP_GTID_DNE to force serial_initialize by the
caller. We had to handle KMP_GTID_DNE at all call-sites, or else guarantee
__kmp_init_gtid for this to work. */
if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
#ifdef KMP_TDATA_GTID
if ( TCR_4(__kmp_gtid_mode) >= 3) {
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
return __kmp_gtid;
}
#endif
if ( TCR_4(__kmp_gtid_mode) >= 2) {
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
return __kmp_gtid_get_specific();
}
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
stack_addr = (char*) & stack_data;
other_threads = __kmp_threads;
/*
ATT: The code below is a source of potential bugs due to unsynchronized access to
__kmp_threads array. For example:
1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
2. Current thread is suspended by OS.
3. Another thread unregisters and finishes (debug versions of free() may fill memory
with something like 0xEF).
4. Current thread is resumed.
5. Current thread reads junk from *thr.
TODO: Fix it.
--ln
*/
for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
if( !thr ) continue;
stack_size = (size_t)TCR_PTR(thr -> th.th_info.ds.ds_stacksize);
stack_base = (char *)TCR_PTR(thr -> th.th_info.ds.ds_stackbase);
/* stack grows down -- search through all of the active threads */
if( stack_addr <= stack_base ) {
size_t stack_diff = stack_base - stack_addr;
if( stack_diff <= stack_size ) {
/* The only way we can be closer than the allocated */
/* stack size is if we are running on this thread. */
KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
return i;
}
}
}
/* get specific to try and determine our gtid */
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
"thread, using TLS\n" ));
i = __kmp_gtid_get_specific();
/*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
/* if we haven't been assigned a gtid, then return the error code */
if( i<0 ) return i;
/* dynamically updated stack window for uber threads to avoid get_specific call */
if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
KMP_FATAL( StackOverflow, i );
}
stack_base = (char *) other_threads[i] -> th.th_info.ds.ds_stackbase;
if( stack_addr > stack_base ) {
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
} else {
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
}
/* Reprint stack bounds for ubermaster since they have been refined */
if ( __kmp_storage_map ) {
char *stack_end = (char *) other_threads[i] -> th.th_info.ds.ds_stackbase;
char *stack_beg = stack_end - other_threads[i] -> th.th_info.ds.ds_stacksize;
__kmp_print_storage_map_gtid( i, stack_beg, stack_end,
other_threads[i] -> th.th_info.ds.ds_stacksize,
"th_%d stack (refinement)", i );
}
return i;
}
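/*
 * Illustration of the stack-window test above (hypothetical numbers): stacks
 * grow down, so if thread i has ds_stackbase == 0x7fff0000 and
 * ds_stacksize == 0x10000, the address of any of its locals satisfies
 * stack_base - stack_addr <= stack_size, i.e. it lies in
 * [0x7ffe0000, 0x7fff0000]; the address of a local in some other thread
 * falls outside that window, so the loop identifies the calling thread.
 */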
int
__kmp_get_global_thread_id_reg( )
{
int gtid;
if ( !__kmp_init_serial ) {
gtid = KMP_GTID_DNE;
} else
#ifdef KMP_TDATA_GTID
if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
gtid = __kmp_gtid;
} else
#endif
if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
gtid = __kmp_gtid_get_specific();
} else {
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
gtid = __kmp_get_global_thread_id();
}
/* we must be a new uber master sibling thread */
if( gtid == KMP_GTID_DNE ) {
KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
"Registering a new gtid.\n" ));
__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
if( !__kmp_init_serial ) {
__kmp_do_serial_initialize();
gtid = __kmp_gtid_get_specific();
} else {
gtid = __kmp_register_root(FALSE);
}
__kmp_release_bootstrap_lock( &__kmp_initz_lock );
/*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
}
KMP_DEBUG_ASSERT( gtid >=0 );
return gtid;
}
/* caller must hold forkjoin_lock */
void
__kmp_check_stack_overlap( kmp_info_t *th )
{
int f;
char *stack_beg = NULL;
char *stack_end = NULL;
int gtid;
KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
if ( __kmp_storage_map ) {
stack_end = (char *) th -> th.th_info.ds.ds_stackbase;
stack_beg = stack_end - th -> th.th_info.ds.ds_stacksize;
gtid = __kmp_gtid_from_thread( th );
if (gtid == KMP_GTID_MONITOR) {
__kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
"th_%s stack (%s)", "mon",
( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
} else {
__kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
"th_%d stack (%s)", gtid,
( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
}
}
/* No point in checking ubermaster threads since they use refinement and cannot overlap */
if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid = __kmp_gtid_from_thread( th )))
{
KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
if ( stack_beg == NULL ) {
stack_end = (char *) th -> th.th_info.ds.ds_stackbase;
stack_beg = stack_end - th -> th.th_info.ds.ds_stacksize;
}
for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
if( f_th && f_th != th ) {
char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
char *other_stack_beg = other_stack_end -
(size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
(stack_end > other_stack_beg && stack_end < other_stack_end)) {
/* Print the other stack values before the abort */
if ( __kmp_storage_map )
__kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
(size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
"th_%d stack (overlapped)",
__kmp_gtid_from_thread( f_th ) );
__kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
}
}
}
}
KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
}
/* ------------------------------------------------------------------------ */
#ifndef KMP_DEBUG
# define __kmp_static_delay( arg ) /* nothing to do */
#else
static void
__kmp_static_delay( int arg )
{
/* Work around weird code-gen bug that causes assert to trip */
# if KMP_ARCH_X86_64 && KMP_OS_LINUX
KMP_ASSERT( arg != 0 );
# else
KMP_ASSERT( arg >= 0 );
# endif
}
#endif /* KMP_DEBUG */
static void
__kmp_static_yield( int arg )
{
__kmp_yield( arg );
}
/*
* Spin wait loop that first does pause, then yield, then sleep.
* Wait until spinner is equal to checker to exit.
*
* A thread that calls __kmp_wait_sleep must make certain that another thread
* calls __kmp_release to wake it back up, to prevent deadlocks!
*/
void
__kmp_wait_sleep( kmp_info_t *this_thr,
volatile kmp_uint *spinner,
kmp_uint checker,
int final_spin
USE_ITT_BUILD_ARG (void * itt_sync_obj)
)
{
/* note: we may not belong to a team at this point */
register volatile kmp_uint *spin = spinner;
register kmp_uint check = checker;
register kmp_uint32 spins;
register kmp_uint32 hibernate;
int th_gtid, th_tid;
#if OMP_30_ENABLED
int flag = FALSE;
#endif /* OMP_30_ENABLED */
KMP_FSYNC_SPIN_INIT( spin, NULL );
if( TCR_4(*spin) == check ) {
KMP_FSYNC_SPIN_ACQUIRED( spin );
return;
}
th_gtid = this_thr->th.th_info.ds.ds_gtid;
KA_TRACE( 20, ("__kmp_wait_sleep: T#%d waiting for spin(%p) == %d\n",
th_gtid,
spin, check ) );
/* setup for waiting */
KMP_INIT_YIELD( spins );
if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
//
// The worker threads cannot rely on the team struct existing at this
// point. Use the bt values cached in the thread struct instead.
//
#ifdef KMP_ADJUST_BLOCKTIME
if ( __kmp_zero_bt && ! this_thr->th.th_team_bt_set ) {
/* force immediate suspend if not set by user and more threads than available procs */
hibernate = 0;
} else {
hibernate = this_thr->th.th_team_bt_intervals;
}
#else
hibernate = this_thr->th.th_team_bt_intervals;
#endif /* KMP_ADJUST_BLOCKTIME */
//
// If the blocktime is nonzero, we want to make sure that we spin
// wait for the entirety of the specified #intervals, plus up to
// one interval more. This increment makes certain that this thread
// doesn't go to sleep too soon.
//
if ( hibernate != 0 ) {
hibernate++;
}
//
// Add in the current time value.
//
hibernate += TCR_4( __kmp_global.g.g_time.dt.t_value );
KF_TRACE( 20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n",
th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate,
hibernate - __kmp_global.g.g_time.dt.t_value ));
}
KMP_MB();
/* main wait spin loop */
while( TCR_4(*spin) != check ) {
int in_pool;
#if OMP_30_ENABLED
//
// If the task team is NULL, it means one of these things:
// 1) A newly-created thread is first being released by
// __kmp_fork_barrier(), and its task team has not been set up
// yet.
// 2) All tasks have been executed to completion, this thread has
// decremented the task team's ref ct and possibly deallocated
// it, and should no longer reference it.
// 3) Tasking is off for this region. This could be because we
// are in a serialized region (perhaps the outer one), or else
// tasking was manually disabled (KMP_TASKING=0).
//
kmp_task_team_t * task_team = NULL;
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
task_team = this_thr->th.th_task_team;
if ( task_team != NULL ) {
if ( ! TCR_SYNC_4( task_team->tt.tt_active ) ) {
KMP_DEBUG_ASSERT( ! KMP_MASTER_TID( this_thr->th.th_info.ds.ds_tid ) );
__kmp_unref_task_team( task_team, this_thr );
} else if ( KMP_TASKING_ENABLED( task_team, this_thr->th.th_task_state ) ) {
__kmp_execute_tasks( this_thr, th_gtid, spin, check, final_spin, &flag
USE_ITT_BUILD_ARG( itt_sync_obj ), 0);
}
}; // if
}; // if
#endif /* OMP_30_ENABLED */
KMP_FSYNC_SPIN_PREPARE( spin );
if( TCR_4(__kmp_global.g.g_done) ) {
if( __kmp_global.g.g_abort )
__kmp_abort_thread( );
break;
}
__kmp_static_delay( 1 );
/* if we are oversubscribed,
or have waited a bit (and KMP_LIBRARY=throughput), then yield */
KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
// TODO: Should it be number of cores instead of thread contexts? Like:
// KMP_YIELD( TCR_4(__kmp_nth) > __kmp_ncores );
// Need performance improvement data to make the change...
KMP_YIELD_SPIN( spins );
//
// Check if this thread was transferred from a team
// to the thread pool (or vice-versa) while spinning.
//
in_pool = !!TCR_4(this_thr->th.th_in_pool);
if ( in_pool != !!this_thr->th.th_active_in_pool ) {
if ( in_pool ) {
//
// recently transferred from team to pool
//
KMP_TEST_THEN_INC32(
(kmp_int32 *) &__kmp_thread_pool_active_nth );
this_thr->th.th_active_in_pool = TRUE;
//
// Here, we cannot assert that
//
// KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth)
// <= __kmp_thread_pool_nth );
//
// __kmp_thread_pool_nth is inc/dec'd by the master thread
// while the fork/join lock is held, whereas
// __kmp_thread_pool_active_nth is inc/dec'd asynchronously
// by the workers. The two can get out of sync for brief
// periods of time.
//
}
else {
//
// recently transferred from pool to team
//
KMP_TEST_THEN_DEC32(
(kmp_int32 *) &__kmp_thread_pool_active_nth );
KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
this_thr->th.th_active_in_pool = FALSE;
}
}
#if OMP_30_ENABLED
// Don't suspend if there is a likelihood of new tasks being spawned.
if ( ( task_team != NULL ) && TCR_4(task_team->tt.tt_found_tasks) ) {
continue;
}
#endif /* OMP_30_ENABLED */
/* Don't suspend if KMP_BLOCKTIME is set to "infinite" */
if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
continue;
}
/* if we have waited a bit more, fall asleep */
if ( TCR_4( __kmp_global.g.g_time.dt.t_value ) < hibernate ) {
continue;
}
KF_TRACE( 50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid ) );
__kmp_suspend( th_gtid, spin, check );
if( TCR_4( __kmp_global.g.g_done ) && __kmp_global.g.g_abort ) {
__kmp_abort_thread( );
}
/* TODO */
/* if the thread is done with its work and times out, disband/free it */
}
KMP_FSYNC_SPIN_ACQUIRED( spin );
}
/*
* Release the thread specified by target_thr from waiting by bumping the location
* specified by spin, and resume the thread if its sleep bit was set.
*
* A thread that calls __kmp_wait_sleep must call this function to wake up the
* potentially sleeping thread and prevent deadlocks!
*/
void
__kmp_release( kmp_info_t *target_thr, volatile kmp_uint *spin,
enum kmp_mem_fence_type fetchadd_fence )
{
kmp_uint old_spin;
#ifdef KMP_DEBUG
int target_gtid = target_thr->th.th_info.ds.ds_gtid;
int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
#endif
KF_TRACE( 20, ( "__kmp_release: T#%d releasing T#%d spin(%p) fence_type(%d)\n",
gtid, target_gtid, spin, fetchadd_fence ));
KMP_DEBUG_ASSERT( spin );
KMP_DEBUG_ASSERT( fetchadd_fence == kmp_acquire_fence ||
fetchadd_fence == kmp_release_fence );
KMP_FSYNC_RELEASING( spin );
old_spin = ( fetchadd_fence == kmp_acquire_fence )
? KMP_TEST_THEN_ADD4_ACQ32( (volatile kmp_int32 *) spin )
: KMP_TEST_THEN_ADD4_32( (volatile kmp_int32 *) spin );
KF_TRACE( 100, ( "__kmp_release: T#%d old spin(%p)=%d, set new spin=%d\n",
gtid, spin, old_spin, *spin ) );
if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
/* Only need to check sleep stuff if infinite block time not set */
if ( old_spin & KMP_BARRIER_SLEEP_STATE ) {
#ifndef KMP_DEBUG
int target_gtid = target_thr->th.th_info.ds.ds_gtid;
#endif
/* wake up thread if needed */
KF_TRACE( 50, ( "__kmp_release: T#%d waking up thread T#%d since sleep spin(%p) set\n",
gtid, target_gtid, spin ));
__kmp_resume( target_gtid, spin );
} else {
KF_TRACE( 50, ( "__kmp_release: T#%d don't wake up thread T#%d since sleep spin(%p) not set\n",
gtid, target_gtid, spin ));
}
}
}
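/*
 * How the two routines above pair up in practice (a simplified sketch of the
 * linear barrier gather further below; same names, no new code path):
 *
 *     // worker side: bump its own b_arrived so the master sees this thread
 *     __kmp_release( other_threads[0], &thr_bar->b_arrived, kmp_release_fence );
 *
 *     // master side: block until worker i reaches the expected count
 *     __kmp_wait_sleep( this_thr, &other_threads[i]->th.th_bar[bt].bb.b_arrived,
 *                       new_state, FALSE
 *                       USE_ITT_BUILD_ARG( itt_sync_obj ) );
 *
 * If a waiter had gone to sleep on its flag, the bumping side detects
 * KMP_BARRIER_SLEEP_STATE in the old value and calls __kmp_resume().
 */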
/* ------------------------------------------------------------------------ */
void
__kmp_infinite_loop( void )
{
static int done = FALSE;
while (! done) {
KMP_YIELD( 1 );
}
}
#define MAX_MESSAGE 512
void
__kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
char buffer[MAX_MESSAGE];
int node;
va_list ap;
va_start( ap, format);
sprintf( buffer, "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
__kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
__kmp_vprintf( kmp_err, buffer, ap );
#if KMP_PRINT_DATA_PLACEMENT
if(gtid >= 0) {
if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
if( __kmp_storage_map_verbose ) {
node = __kmp_get_host_node(p1);
if(node < 0) /* doesn't work, so don't try this next time */
__kmp_storage_map_verbose = FALSE;
else {
char *last;
int lastNode;
int localProc = __kmp_get_cpu_from_gtid(gtid);
p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
if(localProc >= 0)
__kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
else
__kmp_printf_no_lock(" GTID %d\n", gtid);
# if KMP_USE_PRCTL
/* The more elaborate format is disabled for now because of the prctl hanging bug. */
do {
last = p1;
lastNode = node;
/* This loop collates adjacent pages with the same host node. */
do {
p1 = (char*)p1 + PAGE_SIZE;
} while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
__kmp_printf_no_lock(" %p-%p memNode %d\n", last,
(char*)p1 - 1, lastNode);
} while(p1 <= p2);
# else
__kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
(char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
if(p1 < p2) {
__kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
(char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
}
# endif
}
}
} else
__kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
}
#endif /* KMP_PRINT_DATA_PLACEMENT */
__kmp_release_bootstrap_lock( & __kmp_stdio_lock );
}
void
__kmp_warn( char const * format, ... )
{
char buffer[MAX_MESSAGE];
va_list ap;
if ( __kmp_generate_warnings == kmp_warnings_off ) {
return;
}
va_start( ap, format );
snprintf( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
__kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
__kmp_vprintf( kmp_err, buffer, ap );
__kmp_release_bootstrap_lock( & __kmp_stdio_lock );
va_end( ap );
}
void
__kmp_abort_process()
{
// Later threads may stall here, but that's ok because abort() will kill them.
__kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
if ( __kmp_debug_buf ) {
__kmp_dump_debug_buffer();
}; // if
if ( KMP_OS_WINDOWS ) {
// Let other threads know of abnormal termination and prevent deadlock
// if abort happened during library initialization or shutdown
__kmp_global.g.g_abort = SIGABRT;
/*
On Windows* OS, abort() by default pops up an error box, which stalls nightly testing.
Unfortunately, we cannot reliably suppress the pop-up error boxes. _set_abort_behavior()
works well, but this function is not available in VS7 (this is not a problem for the DLL, but
it is a problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does
not help, at least in some versions of the MS C RTL.
It seems the following sequence is the only way to simulate abort() and avoid the pop-up
error box.
*/
raise( SIGABRT );
_exit( 3 ); // Just in case, if signal ignored, exit anyway.
} else {
abort();
}; // if
__kmp_infinite_loop();
__kmp_release_bootstrap_lock( & __kmp_exit_lock );
} // __kmp_abort_process
void
__kmp_abort_thread( void )
{
// TODO: Eliminate the g_abort global variable and this function.
// In case of abort, just call abort(); it will kill all the threads.
__kmp_infinite_loop();
} // __kmp_abort_thread
/* ------------------------------------------------------------------------ */
/*
* Print out the storage map for the major kmp_info_t thread data structures
* that are allocated together.
*/
static void
__kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
{
__kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
__kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
"th_%d.th_info", gtid );
__kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
"th_%d.th_local", gtid );
__kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
__kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
&thr->th.th_bar[bs_plain_barrier+1],
sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
__kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
&thr->th.th_bar[bs_forkjoin_barrier+1],
sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
#if KMP_FAST_REDUCTION_BARRIER
__kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
&thr->th.th_bar[bs_reduction_barrier+1],
sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}
/*
* Print out the storage map for the major kmp_team_t team data structures
* that are allocated together.
*/
static void
__kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
{
int num_disp_buff = team->t.t_max_nproc > 1 ? KMP_MAX_DISP_BUF : 2;
__kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
#if KMP_FAST_REDUCTION_BARRIER
__kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
#endif // KMP_FAST_REDUCTION_BARRIER
__kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
header, team_id );
/*
__kmp_print_storage_map_gtid( -1, &team->t.t_set_nproc[0], &team->t.t_set_nproc[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_dynamic[0], &team->t.t_set_dynamic[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_nested[0], &team->t.t_set_nested[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_blocktime[0], &team->t.t_set_blocktime[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_intervals[0], &team->t.t_set_bt_intervals[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
#if OMP_30_ENABLED
//__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr],
// sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr],
sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id );
#endif // OMP_30_ENABLED
#if OMP_40_ENABLED
__kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr],
sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id );
#endif
*/
__kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
}
static void __kmp_init_allocator() {}
static void __kmp_fini_allocator() {}
static void __kmp_fini_allocator_thread() {}
/* ------------------------------------------------------------------------ */
#ifdef GUIDEDLL_EXPORTS
# if KMP_OS_WINDOWS
static void
__kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
// TODO: Change to __kmp_break_bootstrap_lock().
__kmp_init_bootstrap_lock( lck ); // make the lock released
}
static void
__kmp_reset_locks_on_process_detach( int gtid_req ) {
int i;
int thread_count;
// PROCESS_DETACH is expected to be called by a thread
// that executes ProcessExit() or FreeLibrary().
// The OS terminates the other threads (except the one calling ProcessExit or FreeLibrary).
// So it might be safe to access __kmp_threads[] without taking the forkjoin_lock.
// However, in fact, some threads can still be alive here, although they are about to be terminated.
// The threads in the array with ds_thread==0 are the most suspicious.
// Actually, it may not be safe to access __kmp_threads[] at all.
// TODO: does it make sense to check __kmp_roots[] ?
// Let's check that there are no other alive threads registered with the OMP lib.
while( 1 ) {
thread_count = 0;
for( i = 0; i < __kmp_threads_capacity; ++i ) {
if( !__kmp_threads ) continue;
kmp_info_t* th = __kmp_threads[ i ];
if( th == NULL ) continue;
int gtid = th->th.th_info.ds.ds_gtid;
if( gtid == gtid_req ) continue;
if( gtid < 0 ) continue;
DWORD exit_val;
int alive = __kmp_is_thread_alive( th, &exit_val );
if( alive ) {
++thread_count;
}
}
if( thread_count == 0 ) break; // success
}
// Assume that I'm alone.
// Now it is probably safe to check and reset the locks.
// __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
__kmp_reset_lock( &__kmp_forkjoin_lock );
#ifdef KMP_DEBUG
__kmp_reset_lock( &__kmp_stdio_lock );
#endif // KMP_DEBUG
}
BOOL WINAPI
DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
//__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
switch( fdwReason ) {
case DLL_PROCESS_ATTACH:
KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
return TRUE;
case DLL_PROCESS_DETACH:
KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
__kmp_gtid_get_specific() ));
if( lpReserved != NULL )
{
// lpReserved is used for telling the difference:
// lpReserved == NULL when FreeLibrary() was called,
// lpReserved != NULL when the process terminates.
// When FreeLibrary() is called, worker threads remain alive.
// So they will release the forkjoin lock by themselves.
// When the process terminates, worker threads disappear triggering
// the problem of unreleased forkjoin lock as described below.
// A worker thread can take the forkjoin lock
// in __kmp_suspend()->__kmp_rml_decrease_load_before_sleep().
// The problem comes up if that worker thread becomes dead
// before it releases the forkjoin lock.
// The forkjoin lock remains taken, while the thread
// executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
// will try to take the forkjoin lock and will always fail,
// so that the application will never finish [normally].
// This scenario is possible if __kmpc_end() has not been executed.
// These do not look like corner cases, but rather like common ones:
// - the main function was compiled by an alternative compiler;
// - the main function was compiled by icl but without /Qopenmp (application with plugins);
// - the application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP;
// - an alive foreign thread prevented __kmpc_end from doing cleanup.
// This is a hack to work around the problem.
// TODO: !!! to figure out something better.
__kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
}
__kmp_internal_end_library( __kmp_gtid_get_specific() );
return TRUE;
case DLL_THREAD_ATTACH:
KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
/* if we wanted to register new siblings all the time, we could call
* __kmp_get_gtid() here */
return TRUE;
case DLL_THREAD_DETACH:
KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
__kmp_gtid_get_specific() ));
__kmp_internal_end_thread( __kmp_gtid_get_specific() );
return TRUE;
}
return TRUE;
}
# endif /* KMP_OS_WINDOWS */
#endif /* GUIDEDLL_EXPORTS */
/* ------------------------------------------------------------------------ */
/* Change the library type to "status" and return the old type */
/* called from within initialization routines where __kmp_initz_lock is held */
int
__kmp_change_library( int status )
{
int old_status;
old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
if (status) {
__kmp_yield_init |= 1; // throughput => turnaround (odd init count)
}
else {
__kmp_yield_init &= ~1; // turnaround => throughput (even init count)
}
return old_status; // return previous setting of whether KMP_LIBRARY=throughput
}
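/*
 * Example of the parity encoding used above: the low bit of __kmp_yield_init
 * is the library mode -- even means KMP_LIBRARY=throughput, odd means
 * turnaround. So __kmp_change_library(1) on a throughput library returns
 * old_status == 0 and flips the mode to turnaround; calling it again with 0
 * restores throughput.
 */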
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* __kmp_parallel_deo --
* Wait until it's our turn.
*/
void
__kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
kmp_team_t *team = __kmp_team_from_gtid( gtid );
#endif /* BUILD_PARALLEL_ORDERED */
if( __kmp_env_consistency_check ) {
if( __kmp_threads[gtid] -> th.th_root -> r.r_active )
__kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
}
#ifdef BUILD_PARALLEL_ORDERED
if( !team -> t.t_serialized ) {
kmp_uint32 spins;
KMP_MB();
KMP_WAIT_YIELD(&team -> t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
KMP_MB();
}
#endif /* BUILD_PARALLEL_ORDERED */
}
/* __kmp_parallel_dxo --
* Signal the next task.
*/
void
__kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
int tid = __kmp_tid_from_gtid( gtid );
kmp_team_t *team = __kmp_team_from_gtid( gtid );
#endif /* BUILD_PARALLEL_ORDERED */
if( __kmp_env_consistency_check ) {
if( __kmp_threads[gtid] -> th.th_root -> r.r_active )
__kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
}
#ifdef BUILD_PARALLEL_ORDERED
if ( ! team -> t.t_serialized ) {
KMP_MB(); /* Flush all pending memory write invalidates. */
/* use the tid of the next thread in this team */
/* TODO: replace with a general release procedure */
team -> t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
KMP_MB(); /* Flush all pending memory write invalidates. */
}
#endif /* BUILD_PARALLEL_ORDERED */
}
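/*
 * Together, __kmp_parallel_deo/__kmp_parallel_dxo implement ORDERED as a
 * token passed around the team in tid order (sketch of the code above):
 *
 *     // deo: wait until it is my turn
 *     KMP_WAIT_YIELD( &team->t.t_ordered.dt.t_value, my_tid, KMP_EQ, NULL );
 *     ... ordered region ...
 *     // dxo: hand the token to the next thread in the team
 *     team->t.t_ordered.dt.t_value = ( my_tid + 1 ) % team->t.t_nproc;
 *
 * where my_tid is __kmp_tid_from_gtid( gtid ).
 */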
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */
int
__kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
{
int status;
kmp_info_t *th;
kmp_team_t *team;
if( ! TCR_4(__kmp_init_parallel) )
__kmp_parallel_initialize();
th = __kmp_threads[ gtid ];
team = th -> th.th_team;
status = 0;
th->th.th_ident = id_ref;
if ( team -> t.t_serialized ) {
status = 1;
} else {
kmp_int32 old_this = th->th.th_local.this_construct;
++th->th.th_local.this_construct;
/* try to set team count to thread count--success means thread got the
single block
*/
/* TODO: Should this be acquire or release? */
status = KMP_COMPARE_AND_STORE_ACQ32(&team -> t.t_construct, old_this,
th->th.th_local.this_construct);
}
if( __kmp_env_consistency_check ) {
if (status && push_ws) {
__kmp_push_workshare( gtid, ct_psingle, id_ref );
} else {
__kmp_check_workshare( gtid, ct_psingle, id_ref );
}
}
#if USE_ITT_BUILD
if ( status ) {
__kmp_itt_single_start( gtid );
}
#endif /* USE_ITT_BUILD */
return status;
}
void
__kmp_exit_single( int gtid )
{
#if USE_ITT_BUILD
__kmp_itt_single_end( gtid );
#endif /* USE_ITT_BUILD */
if( __kmp_env_consistency_check )
__kmp_pop_workshare( gtid, ct_psingle, NULL );
}
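/*
 * Sketch of how the pair above is typically driven by the compiler entry
 * points (simplified; the return status selects the single winner):
 *
 *     if ( __kmp_enter_single( gtid, loc, TRUE ) ) {
 *         ... body of the SINGLE construct, run by exactly one thread ...
 *         __kmp_exit_single( gtid );
 *     }
 *
 * The winner is whichever thread first advances team->t.t_construct past its
 * own this_construct counter via the compare-and-store above.
 */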
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
static void
__kmp_linear_barrier_gather( enum barrier_type bt,
kmp_info_t *this_thr,
int gtid,
int tid,
void (*reduce)(void *, void *)
USE_ITT_BUILD_ARG(void * itt_sync_obj)
)
{
register kmp_team_t *team = this_thr -> th.th_team;
register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
register kmp_info_t **other_threads = team -> t.t_threads;
KA_TRACE( 20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] );
/*
* We now perform a linear reduction to signal that all
* of the threads have arrived.
*
* Collect all the worker team member threads.
*/
if ( ! KMP_MASTER_TID( tid )) {
KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
"arrived(%p): %u => %u\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
&thr_bar -> b_arrived, thr_bar -> b_arrived,
thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP
) );
/* mark arrival to master thread */
//
// After performing this write, a worker thread may not assume that
// the team is valid any more - it could be deallocated by the master
// thread at any time.
//
__kmp_release( other_threads[0], &thr_bar -> b_arrived, kmp_release_fence );
} else {
register kmp_balign_team_t *team_bar = & team -> t.t_bar[ bt ];
register int nproc = this_thr -> th.th_team_nproc;
register int i;
/* No need to worry about the sleep bit or atomicity here, since only the master updates the team's value */
register kmp_uint new_state = team_bar -> b_arrived + KMP_BARRIER_STATE_BUMP;
/* Collect all the worker team member threads. */
for (i = 1; i < nproc; i++) {
#if KMP_CACHE_MANAGE
/* prefetch next thread's arrived count */
if ( i+1 < nproc )
KMP_CACHE_PREFETCH( &other_threads[ i+1 ] -> th.th_bar[ bt ].bb.b_arrived );
#endif /* KMP_CACHE_MANAGE */
KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
"arrived(%p) == %u\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( i, team ), team->t.t_id, i,
&other_threads[i] -> th.th_bar[ bt ].bb.b_arrived,
new_state ) );
/* wait for worker thread to arrive */
__kmp_wait_sleep( this_thr,
& other_threads[ i ] -> th.th_bar[ bt ].bb.b_arrived,
new_state, FALSE
USE_ITT_BUILD_ARG( itt_sync_obj )
);
if (reduce) {
KA_TRACE( 100, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( i, team ), team->t.t_id, i ) );
(*reduce)( this_thr -> th.th_local.reduce_data,
other_threads[ i ] -> th.th_local.reduce_data );
}
}
/* No need to worry about the sleep bit or atomicity here, since only the master updates the team's value */
team_bar -> b_arrived = new_state;
KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
"arrived(%p) = %u\n",
gtid, team->t.t_id, tid, team->t.t_id,
&team_bar -> b_arrived, new_state ) );
}
KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
}
static void
__kmp_tree_barrier_gather( enum barrier_type bt,
kmp_info_t *this_thr,
int gtid,
int tid,
void (*reduce) (void *, void *)
USE_ITT_BUILD_ARG( void * itt_sync_obj )
)
{
register kmp_team_t *team = this_thr -> th.th_team;
register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
register kmp_info_t **other_threads = team -> t.t_threads;
register kmp_uint32 nproc = this_thr -> th.th_team_nproc;
register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[ bt ];
register kmp_uint32 branch_factor = 1 << branch_bits ;
register kmp_uint32 child;
register kmp_uint32 child_tid;
register kmp_uint new_state;
KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] );
/*
* We now perform a tree gather to wait until all
* of the threads have arrived, and reduce any required data
* as we go.
*/
child_tid = (tid << branch_bits) + 1;
if ( child_tid < nproc ) {
/* parent threads wait for all their children to arrive */
new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP;
child = 1;
do {
register kmp_info_t *child_thr = other_threads[ child_tid ];
register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
#if KMP_CACHE_MANAGE
/* prefetch next thread's arrived count */
if ( child+1 <= branch_factor && child_tid+1 < nproc )
KMP_CACHE_PREFETCH( &other_threads[ child_tid+1 ] -> th.th_bar[ bt ].bb.b_arrived );
#endif /* KMP_CACHE_MANAGE */
KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
"arrived(%p) == %u\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( child_tid, team ), team->t.t_id, child_tid,
&child_bar -> b_arrived, new_state ) );
/* wait for child to arrive */
__kmp_wait_sleep( this_thr, &child_bar -> b_arrived, new_state, FALSE
USE_ITT_BUILD_ARG( itt_sync_obj)
);
if (reduce) {
KA_TRACE( 100, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
child_tid ) );
(*reduce)( this_thr -> th.th_local.reduce_data,
child_thr -> th.th_local.reduce_data );
}
child++;
child_tid++;
}
while ( child <= branch_factor && child_tid < nproc );
}
if ( !KMP_MASTER_TID(tid) ) {
/* worker threads */
register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
"arrived(%p): %u => %u\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( parent_tid, team ), team->t.t_id, parent_tid,
&thr_bar -> b_arrived, thr_bar -> b_arrived,
thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP
) );
/* mark arrival to parent thread */
//
// After performing this write, a worker thread may not assume that
// the team is valid any more - it could be deallocated by the master
// thread at any time.
//
__kmp_release( other_threads[parent_tid], &thr_bar -> b_arrived, kmp_release_fence );
} else {
/* Need to update the team arrived pointer if we are the master thread */
if ( nproc > 1 )
/* New value was already computed above */
team -> t.t_bar[ bt ].b_arrived = new_state;
else
team -> t.t_bar[ bt ].b_arrived += KMP_BARRIER_STATE_BUMP;
KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
gtid, team->t.t_id, tid, team->t.t_id,
&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived ) );
}
KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
}
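/*
 * Worked example of the tree indexing above: with branch_bits == 2
 * (branch_factor == 4) and nproc == 16, thread tid gathers from children
 * (tid << 2) + 1 .. (tid << 2) + 4 and reports to parent (tid - 1) >> 2.
 * So tid 0 waits for 1-4, tid 1 waits for 5-8, tid 2 for 9-12, tid 3 for
 * 13-15, and each non-master tid bumps its parent's b_arrived when done.
 */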
static void
__kmp_hyper_barrier_gather( enum barrier_type bt,
kmp_info_t *this_thr,
int gtid,
int tid,
void (*reduce) (void *, void *)
USE_ITT_BUILD_ARG (void * itt_sync_obj)
)
{
register kmp_team_t *team = this_thr -> th.th_team;
register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
register kmp_info_t **other_threads = team -> t.t_threads;
register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
register kmp_uint32 num_threads = this_thr -> th.th_team_nproc;
register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[ bt ];
register kmp_uint32 branch_factor = 1 << branch_bits ;
register kmp_uint32 offset;
register kmp_uint32 level;
KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
// Barrier imbalance - save arrive time to the thread
if( __kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3 ) {
this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
}
#endif
/*
* We now perform a hypercube-embedded tree gather to wait until all
* of the threads have arrived, and reduce any required data
* as we go.
*/
for ( level=0, offset =1;
offset < num_threads;
level += branch_bits, offset <<= branch_bits )
{
register kmp_uint32 child;
register kmp_uint32 child_tid;
if ( ((tid >> level) & (branch_factor - 1)) != 0 ) {
register kmp_int32 parent_tid = tid & ~( (1 << (level + branch_bits)) -1 );
KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
"arrived(%p): %u => %u\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( parent_tid, team ), team->t.t_id, parent_tid,
&thr_bar -> b_arrived, thr_bar -> b_arrived,
thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP
) );
/* mark arrival to parent thread */
//
// After performing this write (in the last iteration of the
// enclosing for loop), a worker thread may not assume that the
// team is valid any more - it could be deallocated by the master
// thread at any time.
//
__kmp_release( other_threads[parent_tid], &thr_bar -> b_arrived, kmp_release_fence );
break;
}
/* parent threads wait for children to arrive */
if (new_state == KMP_BARRIER_UNUSED_STATE)
new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP;
for ( child = 1, child_tid = tid + (1 << level);
child < branch_factor && child_tid < num_threads;
child++, child_tid += (1 << level) )
{
register kmp_info_t *child_thr = other_threads[ child_tid ];
register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
#if KMP_CACHE_MANAGE
register kmp_uint32 next_child_tid = child_tid + (1 << level);
/* prefetch next thread's arrived count */
if ( child+1 < branch_factor && next_child_tid < num_threads )
KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ] -> th.th_bar[ bt ].bb.b_arrived );
#endif /* KMP_CACHE_MANAGE */
KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
"arrived(%p) == %u\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( child_tid, team ), team->t.t_id, child_tid,
&child_bar -> b_arrived, new_state ) );
/* wait for child to arrive */
__kmp_wait_sleep( this_thr, &child_bar -> b_arrived, new_state, FALSE
USE_ITT_BUILD_ARG (itt_sync_obj)
);
#if USE_ITT_BUILD
// Barrier imbalance - write min of the thread time and a child time to the thread.
if( __kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3 ) {
this_thr->th.th_bar_arrive_time = KMP_MIN( this_thr->th.th_bar_arrive_time, child_thr->th.th_bar_arrive_time );
}
#endif
if (reduce) {
KA_TRACE( 100, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
child_tid ) );
(*reduce)( this_thr -> th.th_local.reduce_data,
child_thr -> th.th_local.reduce_data );
}
}
}
if ( KMP_MASTER_TID(tid) ) {
/* Need to update the team arrived pointer if we are the master thread */
if (new_state == KMP_BARRIER_UNUSED_STATE)
team -> t.t_bar[ bt ].b_arrived += KMP_BARRIER_STATE_BUMP;
else
team -> t.t_bar[ bt ].b_arrived = new_state;
KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
gtid, team->t.t_id, tid, team->t.t_id,
&team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived ) );
}
KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
}
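/*
 * Worked example of the hypercube gather above: with branch_bits == 1
 * (branch_factor == 2) and 8 threads, at level 0 the odd tids report to
 * tid & ~1 (1->0, 3->2, 5->4, 7->6) and drop out; at level 1, tids 2 and 6
 * report to 0 and 4; at level 2, tid 4 reports to 0. The master (tid 0)
 * therefore waits for 1, then 2, then 4, reducing data as each arrives.
 */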
static void
__kmp_linear_barrier_release( enum barrier_type bt,
kmp_info_t *this_thr,
int gtid,
int tid,
int propagate_icvs
USE_ITT_BUILD_ARG(void * itt_sync_obj)
)
{
register kmp_bstate_t *thr_bar = &this_thr -> th.th_bar[ bt ].bb;
register kmp_team_t *team;
if (KMP_MASTER_TID( tid )) {
register unsigned int i;
register kmp_uint32 nproc = this_thr -> th.th_team_nproc;
register kmp_info_t **other_threads;
team = __kmp_threads[ gtid ]-> th.th_team;
KMP_DEBUG_ASSERT( team != NULL );
other_threads = team -> t.t_threads;
KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
if ( propagate_icvs ) {
load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs);
for (i = 1; i < nproc; i++) {
__kmp_init_implicit_task( team->t.t_ident,
team->t.t_threads[i], team, i, FALSE );
store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs);
}
sync_icvs();
}
#endif // KMP_BARRIER_ICV_PUSH
/* Now, release all of the worker threads */
for (i = 1; i < nproc; i++) {
#if KMP_CACHE_MANAGE
/* prefetch next thread's go flag */
if( i+1 < nproc )
KMP_CACHE_PREFETCH( &other_threads[ i+1 ]-> th.th_bar[ bt ].bb.b_go );
#endif /* KMP_CACHE_MANAGE */
KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
"go(%p): %u => %u\n",
gtid, team->t.t_id, tid,
other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
&other_threads[i]->th.th_bar[bt].bb.b_go,
other_threads[i]->th.th_bar[bt].bb.b_go,
other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP
) );
__kmp_release( other_threads[ i ],
&other_threads[ i ]-> th.th_bar[ bt ].bb.b_go, kmp_acquire_fence );
}
}
} else {
/* Wait for the MASTER thread to release us */
KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
__kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE
USE_ITT_BUILD_ARG(itt_sync_obj)
);
#if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
// we are on a fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
// cancel wait on previous parallel region...
__kmp_itt_task_starting( itt_sync_obj );
if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
return;
itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
if ( itt_sync_obj != NULL )
__kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
} else
#endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
//
// early exit for reaping threads releasing forkjoin barrier
//
if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
return;
//
// The worker thread may now assume that the team is valid.
//
#if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
// libguide only code (cannot use *itt_task* routines)
if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
// we are on a fork barrier where we could not get the object reliably
itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
__kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
}
#endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
#ifdef KMP_DEBUG
tid = __kmp_tid_from_gtid( gtid );
team = __kmp_threads[ gtid ]-> th.th_team;
#endif
KMP_DEBUG_ASSERT( team != NULL );
TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
KA_TRACE( 20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
KMP_MB(); /* Flush all pending memory write invalidates. */
}
KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
}
static void
__kmp_tree_barrier_release( enum barrier_type bt,
kmp_info_t *this_thr,
int gtid,
int tid,
int propagate_icvs
USE_ITT_BUILD_ARG(void * itt_sync_obj)
)
{
/* handle fork barrier workers who aren't part of a team yet */
register kmp_team_t *team;
register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
register kmp_uint32 nproc;
register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
register kmp_uint32 branch_factor = 1 << branch_bits ;
register kmp_uint32 child;
register kmp_uint32 child_tid;
/*
* We now perform a tree release for all
* of the threads that have been gathered
*/
if ( ! KMP_MASTER_TID( tid )) {
/* worker threads */
KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
/* wait for parent thread to release us */
__kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE
USE_ITT_BUILD_ARG(itt_sync_obj)
);
#if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
// we are on a fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
// cancel wait on previous parallel region...
__kmp_itt_task_starting( itt_sync_obj );
if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
return;
itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
if ( itt_sync_obj != NULL )
__kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
} else
#endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
//
// early exit for reaping threads releasing forkjoin barrier
//
if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
return;
//
// The worker thread may now assume that the team is valid.
//
#if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
// libguide only code (cannot use *itt_task* routines)
if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
// we are on a fork barrier where we could not get the object reliably
itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
__kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
}
#endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
team = __kmp_threads[ gtid ]-> th.th_team;
KMP_DEBUG_ASSERT( team != NULL );
tid = __kmp_tid_from_gtid( gtid );
TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
KMP_MB(); /* Flush all pending memory write invalidates. */
} else {
team = __kmp_threads[ gtid ]-> th.th_team;
KMP_DEBUG_ASSERT( team != NULL );
KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
}
nproc = this_thr -> th.th_team_nproc;
child_tid = ( tid << branch_bits ) + 1;
if ( child_tid < nproc ) {
register kmp_info_t **other_threads = team -> t.t_threads;
child = 1;
/* parent threads release all their children */
do {
register kmp_info_t *child_thr = other_threads[ child_tid ];
register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
#if KMP_CACHE_MANAGE
/* prefetch next thread's go count */
if ( child+1 <= branch_factor && child_tid+1 < nproc )
KMP_CACHE_PREFETCH( &other_threads[ child_tid+1 ] -> th.th_bar[ bt ].bb.b_go );
#endif /* KMP_CACHE_MANAGE */
#if KMP_BARRIER_ICV_PUSH
if ( propagate_icvs ) {
__kmp_init_implicit_task( team->t.t_ident,
team->t.t_threads[child_tid], team, child_tid, FALSE );
load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs);
store_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs);
sync_icvs();
}
#endif // KMP_BARRIER_ICV_PUSH
KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
"go(%p): %u => %u\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
child_tid, &child_bar -> b_go, child_bar -> b_go,
child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) );
/* release child from barrier */
__kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence );
child++;
child_tid++;
}
while ( child <= branch_factor && child_tid < nproc );
}
KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
}
/* The reverse versions seem to beat the forward versions overall */
#define KMP_REVERSE_HYPER_BAR
static void
__kmp_hyper_barrier_release( enum barrier_type bt,
kmp_info_t *this_thr,
int gtid,
int tid,
int propagate_icvs
USE_ITT_BUILD_ARG(void * itt_sync_obj)
)
{
/* handle fork barrier workers who aren't part of a team yet */
register kmp_team_t *team;
register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
register kmp_info_t **other_threads;
register kmp_uint32 num_threads;
register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
register kmp_uint32 branch_factor = 1 << branch_bits;
register kmp_uint32 child;
register kmp_uint32 child_tid;
register kmp_uint32 offset;
register kmp_uint32 level;
/* Perform a hypercube-embedded tree release for all of the threads
that have been gathered. If KMP_REVERSE_HYPER_BAR is defined (default)
the threads are released in the reverse order of the corresponding gather,
otherwise threads are released in the same order. */
if ( ! KMP_MASTER_TID( tid )) {
/* worker threads */
KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
/* wait for parent thread to release us */
__kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE
USE_ITT_BUILD_ARG( itt_sync_obj )
);
#if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
// we are on a fork barrier where we could not get the object reliably
itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
// cancel wait on previous parallel region...
__kmp_itt_task_starting( itt_sync_obj );
if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
return;
itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
if ( itt_sync_obj != NULL )
__kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
} else
#endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
//
// early exit for reaping threads releasing forkjoin barrier
//
if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
return;
//
// The worker thread may now assume that the team is valid.
//
#if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
// libguide only code (cannot use *itt_task* routines)
if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
// we are on a fork barrier where we could not get the object reliably
itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
__kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
}
#endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
team = __kmp_threads[ gtid ]-> th.th_team;
KMP_DEBUG_ASSERT( team != NULL );
tid = __kmp_tid_from_gtid( gtid );
TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
KMP_MB(); /* Flush all pending memory write invalidates. */
} else { /* KMP_MASTER_TID(tid) */
team = __kmp_threads[ gtid ]-> th.th_team;
KMP_DEBUG_ASSERT( team != NULL );
KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
}
num_threads = this_thr -> th.th_team_nproc;
other_threads = team -> t.t_threads;
#ifdef KMP_REVERSE_HYPER_BAR
/* count up to correct level for parent */
for ( level = 0, offset = 1;
offset < num_threads && (((tid >> level) & (branch_factor-1)) == 0);
level += branch_bits, offset <<= branch_bits );
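/* At this point (level, offset) identify the first level at which tid is not
a sub-cube root: the level at which this thread was itself released, or, for
the master, the first level whose offset reaches num_threads. The descent
below therefore visits exactly the levels at which this thread has children
to release. */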
/* now go down from there */
for ( level -= branch_bits, offset >>= branch_bits;
offset != 0;
level -= branch_bits, offset >>= branch_bits )
#else
/* Go down the tree, level by level */
for ( level = 0, offset = 1;
offset < num_threads;
level += branch_bits, offset <<= branch_bits )
#endif // KMP_REVERSE_HYPER_BAR
{
#ifdef KMP_REVERSE_HYPER_BAR
/* Now go in reverse order through the children, highest to lowest.
Initial setting of child is conservative here. */
child = num_threads >> ((level==0)?level:level-1);
for ( child = (child < branch_factor-1) ? child : branch_factor-1,
child_tid = tid + (child << level);
child >= 1;
child--, child_tid -= (1 << level) )
#else
if (((tid >> level) & (branch_factor - 1)) != 0)
/* No need to go any lower than this, since this is the level
at which the parent would be notified */
break;
/* iterate through children on this level of the tree */
for ( child = 1, child_tid = tid + (1 << level);
child < branch_factor && child_tid < num_threads;
child++, child_tid += (1 << level) )
#endif // KMP_REVERSE_HYPER_BAR
{
if ( child_tid >= num_threads ) continue; /* child doesn't exist so keep going */
else {
register kmp_info_t *child_thr = other_threads[ child_tid ];
register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
#if KMP_CACHE_MANAGE
register kmp_uint32 next_child_tid = child_tid - (1 << level);
/* prefetch next thread's go count */
#ifdef KMP_REVERSE_HYPER_BAR
if ( child-1 >= 1 && next_child_tid < num_threads )
#else
if ( child+1 < branch_factor && next_child_tid < num_threads )
#endif // KMP_REVERSE_HYPER_BAR
KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ]->th.th_bar[ bt ].bb.b_go );
#endif /* KMP_CACHE_MANAGE */
#if KMP_BARRIER_ICV_PUSH
if ( propagate_icvs ) {
KMP_DEBUG_ASSERT( team != NULL );
__kmp_init_implicit_task( team->t.t_ident,
team->t.t_threads[child_tid], team, child_tid, FALSE );
load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs);
store_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs);
sync_icvs();
}
#endif // KMP_BARRIER_ICV_PUSH
KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
"go(%p): %u => %u\n",
gtid, team->t.t_id, tid,
__kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
child_tid, &child_bar -> b_go, child_bar -> b_go,
child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) );
/* release child from barrier */
__kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence );
}
}
}
KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
gtid, team->t.t_id, tid, bt ) );
}
/*
* Internal function to do a barrier.
* If is_split is true, do a split barrier, otherwise do a plain barrier.
* If reduce is non-NULL, perform the reduction as part of the barrier
* (a split reduction barrier).
* Returns 0 if this is the master thread, 1 if it is a worker thread.
*/
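/*
* The barrier runs in two phases: a gather phase (linear, tree, or hypercube,
* selected per barrier type), during which reduce_data is combined via reduce()
* if one was supplied, and a release phase that propagates the go signal back
* down. The master skips the release here when is_split is true; in that case
* __kmp_end_split_barrier() performs the release later.
*/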
int
__kmp_barrier( enum barrier_type bt, int gtid, int is_split,
size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *) )
{
register int tid = __kmp_tid_from_gtid( gtid );
register kmp_info_t *this_thr = __kmp_threads[ gtid ];
register kmp_team_t *team = this_thr -> th.th_team;
register int status = 0;
ident_t * tmp_loc = __kmp_threads[ gtid ]->th.th_ident;
KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) has arrived\n",
gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid) ) );
if ( ! team->t.t_serialized ) {
#if USE_ITT_BUILD
// This value will be used in itt notify events below.
void * itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
itt_sync_obj = __kmp_itt_barrier_object( gtid, bt, 1 );
#endif
#endif /* USE_ITT_BUILD */
#if OMP_30_ENABLED
if ( __kmp_tasking_mode == tskm_extra_barrier ) {
__kmp_tasking_barrier( team, this_thr, gtid );
KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid) ) );
}
#endif /* OMP_30_ENABLED */
//
// Copy the blocktime info to the thread, where __kmp_wait_sleep()
// can access it when the team struct is not guaranteed to exist.
//
// See the note about the corresponding code in __kmp_join_barrier()
// being performance-critical.
//
if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
#if OMP_30_ENABLED
this_thr -> th.th_team_bt_intervals = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
this_thr -> th.th_team_bt_set = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
this_thr -> th.th_team_bt_intervals = team -> t.t_set_bt_intervals[tid];
this_thr -> th.th_team_bt_set= team -> t.t_set_bt_set[tid];
#endif // OMP_30_ENABLED
}
#if USE_ITT_BUILD
if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
__kmp_itt_barrier_starting( gtid, itt_sync_obj );
#endif /* USE_ITT_BUILD */
if ( reduce != NULL ) {
//KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
this_thr -> th.th_local.reduce_data = reduce_data;
}
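// The gather pattern is selected per barrier type; a branch-bits value of 0
// forces the linear algorithm regardless of the configured pattern.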
if ( __kmp_barrier_gather_pattern[ bt ] == bp_linear_bar || __kmp_barrier_gather_branch_bits[ bt ] == 0 ) {
__kmp_linear_barrier_gather( bt, this_thr, gtid, tid, reduce
USE_ITT_BUILD_ARG( itt_sync_obj )
);
} else if ( __kmp_barrier_gather_pattern[ bt ] == bp_tree_bar ) {
__kmp_tree_barrier_gather( bt, this_thr, gtid, tid, reduce
USE_ITT_BUILD_ARG( itt_sync_obj )
);
} else {
__kmp_hyper_barrier_gather( bt, this_thr, gtid, tid, reduce
USE_ITT_BUILD_ARG( itt_sync_obj )
);
}; // if
#if USE_ITT_BUILD
// TODO: In case of split reduction barrier, master thread may send acquired event early,
// before the final summation into the shared variable is done (final summation can be a
// long operation for array reductions).
if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
__kmp_itt_barrier_middle( gtid, itt_sync_obj );
#endif /* USE_ITT_BUILD */
KMP_MB();
if ( KMP_MASTER_TID( tid ) ) {
status = 0;
#if OMP_30_ENABLED
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
__kmp_task_team_wait( this_thr, team
USE_ITT_BUILD_ARG( itt_sync_obj )
);
__kmp_task_team_setup( this_thr, team );
}
#endif /* OMP_30_ENABLED */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
// Barrier - report frame end
if( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode ) {
kmp_uint64 tmp = __itt_get_timestamp();
switch( __kmp_forkjoin_frames_mode ) {
case 1:
__kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, tmp_loc );
this_thr->th.th_frame_time = tmp;
break;
case 2:
__kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, tmp_loc );
break;
case 3:
__kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, tmp_loc );
__kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, tmp_loc );
this_thr->th.th_frame_time = tmp;
break;
}
}
#endif /* USE_ITT_BUILD */
} else {
status = 1;
}
if ( status == 1 || ! is_split ) {
if ( __kmp_barrier_release_pattern[ bt ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bt ] == 0 ) {
__kmp_linear_barrier_release( bt, this_thr, gtid, tid, FALSE
USE_ITT_BUILD_ARG( itt_sync_obj )
);
} else if ( __kmp_barrier_release_pattern[ bt ] == bp_tree_bar ) {
__kmp_tree_barrier_release( bt, this_thr, gtid, tid, FALSE
USE_ITT_BUILD_ARG( itt_sync_obj )
);
} else {
__kmp_hyper_barrier_release( bt, this_thr, gtid, tid, FALSE
USE_ITT_BUILD_ARG( itt_sync_obj )
);
}
#if OMP_30_ENABLED
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
__kmp_task_team_sync( this_thr, team );
}
#endif /* OMP_30_ENABLED */
}
#if USE_ITT_BUILD
// GEH: TODO: Move this under if-condition above and also include in __kmp_end_split_barrier().
// This will more accurately represent the actual release time of the threads for split barriers.
if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
__kmp_itt_barrier_finished( gtid, itt_sync_obj );
#endif /* USE_ITT_BUILD */
} else { // Team is serialized.
status = 0;
#if OMP_30_ENABLED
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
//
// The task team should be NULL for serialized code.
// (tasks will be executed immediately).
//
KMP_DEBUG_ASSERT( team->t.t_task_team == NULL );
KMP_DEBUG_ASSERT( this_thr->th.th_task_team == NULL );
}
#endif /* OMP_30_ENABLED */
}
KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid),
status ) );
return status;
}
void
__kmp_end_split_barrier( enum barrier_type bt, int gtid )
{
int tid = __kmp_tid_from_gtid( gtid );
kmp_info_t *this_thr = __kmp_threads[ gtid ];
kmp_team_t *team = this_thr -> th.th_team;
if( ! team -> t.t_serialized ) {
if( KMP_MASTER_GTID( gtid ) ) {
if ( __kmp_barrier_release_pattern[ bt ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bt ] == 0 ) {
__kmp_linear_barrier_release( bt, this_thr, gtid, tid, FALSE
#if USE_ITT_BUILD
, NULL
#endif /* USE_ITT_BUILD */
);
} else if ( __kmp_barrier_release_pattern[ bt ] == bp_tree_bar ) {
__kmp_tree_barrier_release( bt, this_thr, gtid, tid, FALSE
#if USE_ITT_BUILD
, NULL
#endif /* USE_ITT_BUILD */
);
} else {
__kmp_hyper_barrier_release( bt, this_thr, gtid, tid, FALSE
#if USE_ITT_BUILD
, NULL
#endif /* USE_ITT_BUILD */
);
}; // if
#if OMP_30_ENABLED
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
__kmp_task_team_sync( this_thr, team );
}; // if
#endif /* OMP_30_ENABLED */
}
}
}
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/*
* Determine if we can go parallel or must use a serialized parallel region,
* and how many threads we can use.
* set_nthreads is the number of threads requested for the team.
* Returns 1 if we should serialize or use only one thread,
* otherwise the number of threads to use.
* The forkjoin lock is held by the caller.
*/
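/*
* The requested count is first adjusted according to the active dynamic mode
* (load balance, thread limit, or random), then capped by the global thread
* limit (__kmp_max_nth) and by the capacity of the __kmp_threads array,
* expanding that array when possible.
*/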
static int
__kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
int master_tid, int set_nthreads
#if OMP_40_ENABLED
, int enter_teams
#endif /* OMP_40_ENABLED */
)
{
int capacity;
int new_nthreads;
int use_rml_to_adjust_nth;
KMP_DEBUG_ASSERT( __kmp_init_serial );
KMP_DEBUG_ASSERT( root && parent_team );
//
// Initial check to see if we should use a serialized team.
//
if ( set_nthreads == 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d reserving 1 thread; requested %d threads\n",
__kmp_get_gtid(), set_nthreads ));
return 1;
}
if ( ( !get__nested_2(parent_team,master_tid) && (root->r.r_in_parallel
#if OMP_40_ENABLED
&& !enter_teams
#endif /* OMP_40_ENABLED */
) ) || ( __kmp_library == library_serial ) ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team; requested %d threads\n",
__kmp_get_gtid(), set_nthreads ));
return 1;
}
//
// If dyn-var is set, dynamically adjust the number of desired threads,
// according to the method specified by dynamic_mode.
//
new_nthreads = set_nthreads;
use_rml_to_adjust_nth = FALSE;
if ( ! get__dynamic_2( parent_team, master_tid ) ) {
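// dyn-var is false: honor the requested number of threads as-is,
// so new_nthreads stays equal to set_nthreads.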
;
}
#ifdef USE_LOAD_BALANCE
else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
if ( new_nthreads == 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
master_tid ));
return 1;
}
if ( new_nthreads < set_nthreads ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
master_tid, new_nthreads ));
}
}
#endif /* USE_LOAD_BALANCE */
else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
: root->r.r_hot_team->t.t_nproc);
if ( new_nthreads <= 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
master_tid ));
return 1;
}
if ( new_nthreads < set_nthreads ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
master_tid, new_nthreads ));
}
else {
new_nthreads = set_nthreads;
}
}
else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
if ( set_nthreads > 2 ) {
new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
new_nthreads = ( new_nthreads % set_nthreads ) + 1;
if ( new_nthreads == 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
master_tid ));
return 1;
}
if ( new_nthreads < set_nthreads ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
master_tid, new_nthreads ));
}
}
}
else {
KMP_ASSERT( 0 );
}
//
// Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
//
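// new_nthreads includes threads that are already counted in __kmp_nth (just
// the master when this root is active, otherwise the existing hot team), so
// that overlap is subtracted before comparing against the limit.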
if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
root->r.r_hot_team->t.t_nproc );
if ( tl_nthreads <= 0 ) {
tl_nthreads = 1;
}
//
// If dyn-var is false, emit a 1-time warning.
//
if ( ! get__dynamic_2( parent_team, master_tid )
&& ( ! __kmp_reserve_warn ) ) {
__kmp_reserve_warn = 1;
__kmp_msg(
kmp_ms_warning,
KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
KMP_HNT( Unset_ALL_THREADS ),
__kmp_msg_null
);
}
if ( tl_nthreads == 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
master_tid ));
return 1;
}
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
master_tid, tl_nthreads ));
new_nthreads = tl_nthreads;
}
//
// Check if the threads array is large enough, or needs expanding.
//
// See comment in __kmp_register_root() about the adjustment if
// __kmp_threads[0] == NULL.
//
capacity = __kmp_threads_capacity;
if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
--capacity;
}
if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
root->r.r_hot_team->t.t_nproc ) > capacity ) {
//
// Expand the threads array.
//
int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
root->r.r_hot_team->t.t_nproc ) - capacity;
int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
if ( slotsAdded < slotsRequired ) {
//
// The threads array was not expanded enough.
//
new_nthreads -= ( slotsRequired - slotsAdded );
KMP_ASSERT( new_nthreads >= 1 );
//
// If dyn-var is false, emit a 1-time warning.
//
if ( ! get__dynamic_2( parent_team, master_tid )
&& ( ! __kmp_reserve_warn ) ) {
__kmp_reserve_warn = 1;
if ( __kmp_tp_cached ) {
__kmp_msg(
kmp_ms_warning,
KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
KMP_HNT( PossibleSystemLimitOnThreads ),
__kmp_msg_null
);
}
else {
__kmp_msg(
kmp_ms_warning,
KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
KMP_HNT( SystemLimitOnThreads ),
__kmp_msg_null
);
}
}
}
}
if ( new_nthreads == 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
__kmp_get_gtid(), set_nthreads ) );
return 1;
}
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
__kmp_get_gtid(), new_nthreads, set_nthreads ));
return new_nthreads;
}
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* allocate threads from the thread pool and assign them to the new team */
/* we are assured that there are enough threads available, because we
* checked on that earlier while holding the forkjoin lock */
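/* The master is installed at tid 0; for a non-hot team each worker slot is
* filled by __kmp_allocate_thread(), which reuses a pooled thread or forks a
* new one, and each worker's barrier b_arrived counters are aligned with the
* team's so that newly installed threads start in step with the next barrier. */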
static void
__kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
kmp_info_t *master_th, int master_gtid )
{
int i;
KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
KMP_MB();
/* first, let's setup the master thread */
master_th -> th.th_info.ds.ds_tid = 0;
master_th -> th.th_team = team;
master_th -> th.th_team_nproc = team -> t.t_nproc;
master_th -> th.th_team_master = master_th;
master_th -> th.th_team_serialized = FALSE;
master_th -> th.th_dispatch = & team -> t.t_dispatch[ 0 ];
/* make sure we are not the optimized hot team */
if ( team != root->r.r_hot_team ) {
/* install the master thread */
team -> t.t_threads[ 0 ] = master_th;
__kmp_initialize_info( master_th, team, 0, master_gtid );
/* now, install the worker threads */
for ( i=1 ; i < team->t.t_nproc ; i++ ) {
/* fork or reallocate a new thread and install it in team */
team -> t.t_threads[ i ] = __kmp_allocate_thread( root, team, i );
KMP_DEBUG_ASSERT( team->t.t_threads[i] );
KMP_DEBUG_ASSERT( team->t.t_threads[i]->th.th_team == team );
/* align team and thread arrived states */
KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%u, plain=%u\n",
__kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
__kmp_gtid_from_tid( i, team ), team->t.t_id, i,
team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
{ // Initialize threads' barrier data.
int b;
kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
for ( b = 0; b < bs_last_barrier; ++ b ) {
balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
}; // for b
}
}
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
__kmp_partition_places( team );
#endif
}
KMP_MB();
}
static void
__kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
static void
__kmp_setup_icv_copy( kmp_team_t *team, int new_nproc,
#if OMP_30_ENABLED
kmp_internal_control_t * new_icvs,
ident_t * loc
#else
int new_set_nproc, int new_set_dynamic, int new_set_nested,
int new_set_blocktime, int new_bt_intervals, int new_bt_set
#endif // OMP_30_ENABLED
); // forward declaration
/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int
__kmp_fork_call(
ident_t * loc,
int gtid,
int exec_master, // 0 - GNU native code, master doesn't invoke microtask
// 1 - Intel code, master invokes microtask
// 2 - MS native code, use special invoker
kmp_int32 argc,
microtask_t microtask,
launch_t invoker,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX
va_list * ap
#else
va_list ap
#endif
)
{
void **argv;
int i;
int master_tid;
int master_this_cons;
int master_last_cons;
kmp_team_t *team;
kmp_team_t *parent_team;
kmp_info_t *master_th;
kmp_root_t *root;
int nthreads;
int master_active;
int master_set_numthreads;
int level;
#if OMP_40_ENABLED
int teams_level;
#endif
KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
/* initialize if needed */
KMP_DEBUG_ASSERT( __kmp_init_serial );
if( ! TCR_4(__kmp_init_parallel) )
__kmp_parallel_initialize();
/* setup current data */
master_th = __kmp_threads[ gtid ];
parent_team = master_th -> th.th_team;
master_tid = master_th -> th.th_info.ds.ds_tid;
master_this_cons = master_th -> th.th_local.this_construct;
master_last_cons = master_th -> th.th_local.last_construct;
root = master_th -> th.th_root;
master_active = root -> r.r_active;
master_set_numthreads = master_th -> th.th_set_nproc;
#if OMP_30_ENABLED
// Nested level will be an index in the nested nthreads array
level = parent_team->t.t_level;
#endif // OMP_30_ENABLED
#if OMP_40_ENABLED
teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
#endif
master_th->th.th_ident = loc;
#if OMP_40_ENABLED
if ( master_th->th.th_team_microtask &&
ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
// AC: This is the start of a parallel region nested inside a teams construct.
// The team is actual (hot): all workers are ready at the fork barrier.
// No lock is needed to do some initialization on the team, then release the workers.
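// In the common case here the hot parent team is reused directly: the
// arguments are copied into it, its nesting levels are bumped, the workers
// are released via __kmp_internal_fork(), and the master invokes the
// microtask itself before returning TRUE; no new team is allocated.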
parent_team->t.t_ident = loc;
parent_team->t.t_argc = argc;
argv = (void**)parent_team->t.t_argv;
for( i=argc-1; i >= 0; --i )
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX
*argv++ = va_arg( *ap, void * );
#else
*argv++ = va_arg( ap, void * );
#endif
/* Increment our nested depth levels, but do not increase the serialization */
if ( parent_team == master_th->th.th_serial_team ) {
// AC: we are in serialized parallel
__kmpc_serialized_parallel(loc, gtid);
KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
parent_team->t.t_serialized--; // AC: need this so that the enquiry functions
// work correctly; will restore at join time
__kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
return TRUE;
}
parent_team->t.t_pkfn = microtask;
parent_team->t.t_invoke = invoker;
KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
parent_team->t.t_active_level ++;
parent_team->t.t_level ++;
/* Change number of threads in the team if requested */
if ( master_set_numthreads ) { // The parallel has num_threads clause
if ( master_set_numthreads < master_th->th.th_set_nth_teams ) {
// AC: can only reduce the number of threads dynamically, cannot increase it
kmp_info_t **other_threads = parent_team->t.t_threads;
parent_team->t.t_nproc = master_set_numthreads;
for ( i = 0; i < master_set_numthreads; ++i ) {
other_threads[i]->th.th_team_nproc = master_set_numthreads;
}
// Keep the extra threads hot in the team for a possible subsequent parallel region
}
master_th->th.th_set_nproc = 0;
}
KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
__kmp_internal_fork( loc, gtid, parent_team );
KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
/* Invoke microtask for MASTER thread */
KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
if (! parent_team->t.t_invoke( gtid )) {
KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
}
KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
KMP_MB(); /* Flush all pending memory write invalidates. */
KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
return TRUE;
}
#endif /* OMP_40_ENABLED */
#if OMP_30_ENABLED && KMP_DEBUG
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team );
}
#endif // OMP_30_ENABLED
/* determine how many new threads we can use */
__kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
#if OMP_30_ENABLED
if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
nthreads = 1;
}
else
#endif // OMP_30_ENABLED
{
nthreads = master_set_numthreads ?
master_set_numthreads : get__nproc_2( parent_team, master_tid );
nthreads = __kmp_reserve_threads( root, parent_team, master_tid, nthreads
#if OMP_40_ENABLED
// AC: If we execute teams from a parallel region (on the host), then the teams
// should be created, but each can have only 1 thread if nesting is disabled.
// If teams is called from a serial region, then the teams and their threads
// should be created regardless of the nesting setting.
,( ( ap == NULL && teams_level == 0 ) ||
( ap && teams_level > 0 && teams_level == level ) )
#endif /* OMP_40_ENABLED */
);
}
KMP_DEBUG_ASSERT( nthreads > 0 );
/* If we temporarily changed the set number of threads then restore it now */
master_th -> th.th_set_nproc = 0;
/* create a serialized parallel region? */
if ( nthreads == 1 ) {
/* josh todo: hypothetical question: what do we do for OS X*? */
#if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM )
void * args[ argc ];
#else
void * * args = (void**) alloca( argc * sizeof( void * ) );
#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM ) */
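// On Linux x86/x86_64/ARM a C99 variable-length array holds the argument copy;
// other platforms fall back to alloca(), keeping it on the stack either way
// (presumably to avoid a heap allocation on this serialization fast path).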
__kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
__kmpc_serialized_parallel(loc, gtid);
if ( exec_master == 0 ) {
// we were called from GNU native code
KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
return FALSE;
} else if ( exec_master == 1 ) {
/* TODO this sucks, use the compiler itself to pass args! :) */
master_th -> th.th_serial_team -> t.t_ident = loc;
#if OMP_40_ENABLED
if ( !ap ) {
// revert change made in __kmpc_serialized_parallel()
master_th -> th.th_serial_team -> t.t_level--;
// Get args from parent team for teams construct
__kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
} else if ( microtask == (microtask_t)__kmp_teams_master ) {
KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
team = master_th->th.th_team;
//team->t.t_pkfn = microtask;
team->t.t_invoke = invoker;
__kmp_alloc_argv_entries( argc, team, TRUE );
team->t.t_argc = argc;
argv = (void**) team->t.t_argv;
if ( ap ) {
for( i=argc-1; i >= 0; --i )
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX
*argv++ = va_arg( *ap, void * );
#else
*argv++ = va_arg( ap, void * );
#endif
} else {
for( i=0; i < argc; ++i )
// Get args from parent team for teams construct
argv[i] = parent_team->t.t_argv[i];
}
// AC: revert change made in __kmpc_serialized_parallel()
// because initial code in teams should have level=0