blob: 4f6b30471a533097a3c9cd410044044f0943bfd4 [file] [log] [blame]
#include"SimpleMOC_header.h"
#ifdef PAPI
// initialize papi with one thread first
void papi_serial_init(void)
{
if ( PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT){
fprintf(stderr, "PAPI library init error!\n");
exit(1);
}
if (( PAPI_thread_init((long unsigned int (*)(void))
pthread_self )) != PAPI_OK){
PAPI_perror("PAPI_thread_init");
exit(1);
}
}
void counter_init( int *eventset, int *num_papi_events, Input * I )
{
char error_str[PAPI_MAX_STR_LEN];
int stat;
int * events;
// Command line event
if( I->papi_event_set == -1){
*num_papi_events = 1;
events = (int *) malloc( *num_papi_events * sizeof(int));
PAPI_event_name_to_code( I->event_name, &events[0]);
}
// FLOPS
if( I->papi_event_set == 0 )
{
*num_papi_events = 2;
events = (int *) malloc( *num_papi_events * sizeof(int));
events[0] = PAPI_SP_OPS;
events[1] = PAPI_TOT_CYC;
}
// Bandwidth
if( I->papi_event_set == 1 )
{
*num_papi_events = 2;
events = (int *) malloc( *num_papi_events * sizeof(int));
events[0] = PAPI_L3_TCM;
events[1] = PAPI_TOT_CYC;
}
// CPU Stall Reason
if( I->papi_event_set == 2 )
{
*num_papi_events = 4;
events = (int *) malloc( *num_papi_events * sizeof(int));
int EventCode;
char * event1 = "RESOURCE_STALLS:ANY";
char * event2 = "RESOURCE_STALLS:SB";
char * event3 = "RESOURCE_STALLS:RS";
char * event4 = "RESOURCE_STALLS2:OOO_RSRC";
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
PAPI_event_name_to_code( event3, &EventCode );
events[2] = EventCode;
PAPI_event_name_to_code( event4, &EventCode );
events[3] = EventCode;
}
// CPU Stall Percentage
if( I->papi_event_set == 3 )
{
*num_papi_events = 2;
events = (int *) malloc( *num_papi_events * sizeof(int));
int EventCode;
char * event1 = "RESOURCE_STALLS:ANY";
char * event2 = "PAPI_TOT_CYC";
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
}
// Memory Loads
if( I->papi_event_set == 4 )
{
*num_papi_events = 4;
events = (int *) malloc( *num_papi_events * sizeof(int));
int EventCode;
char * event1 = "MEM_LOAD_UOPS_RETIRED";
char * event2 = "MEM_LOAD_UOPS_RETIRED:L1_HIT";
char * event3 = "MEM_LOAD_UOPS_RETIRED:L2_HIT";
char * event4 = "MEM_LOAD_UOPS_RETIRED:L3_HIT";
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
PAPI_event_name_to_code( event3, &EventCode );
events[2] = EventCode;
PAPI_event_name_to_code( event4, &EventCode );
events[3] = EventCode;
}
// LLC Miss Rate
if( I->papi_event_set == 5 )
{
*num_papi_events = 2;
events = (int *) malloc( *num_papi_events * sizeof(int));
events[0] = PAPI_L3_TCM;
events[1] = PAPI_L3_TCA;
}
// Branch MisPrediction
if( I->papi_event_set == 6 )
{
*num_papi_events = 3;
events = (int *) malloc( *num_papi_events * sizeof(int));
events[0] = PAPI_BR_MSP;
events[1] = PAPI_BR_CN;
events[2] = PAPI_BR_PRC;
}
// TLB Misses
if( I->papi_event_set == 7 )
{
*num_papi_events = 4;
events = (int *) malloc( *num_papi_events * sizeof(int));
int EventCode;
char * event1 = "perf::DTLB-LOADS";
char * event2 = "perf::DTLB-LOAD-MISSES";
char * event3 = "perf::DTLB-STORES";
char * event4 = "perf::DTLB-STORE-MISSES";
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
PAPI_event_name_to_code( event3, &EventCode );
events[2] = EventCode;
PAPI_event_name_to_code( event4, &EventCode );
events[3] = EventCode;
}
/////////////////////////////////////////////////////////////////////////
// PAPI EVENT SELECTION
/////////////////////////////////////////////////////////////////////////
// User can comment/uncomment blocks as they see fit within this seciton
// Some Standard Events
//int events[] = {PAPI_TOT_INS,PAPI_LD_INS,PAPI_FP_INS};
// Bandwidth Used
// ((PAPI_Lx_TCM * Lx_linesize) / PAPI_TOT_CYC) * Clock(MHz)
//int events[] = {PAPI_L3_TCM, PAPI_TOT_CYC};
// L3 Total Cache Miss Ratio
// PAPI_L3_TCM / PAPI_L3_TCA
// (On Xeon dual octo - 65%, not dependent on # of threads)
//int events[] = {PAPI_L3_TCM, PAPI_L3_TCA};
// % Cycles with no instruction use
// PAPI_STL_ICY / PAPI_TOT_CYC
//int events[] = { PAPI_STL_ICY, PAPI_TOT_CYC };
// % Branch instructions Mispredicted
// PAPI_BR_MSP / PAPI_BR_CN
//int events[] = { PAPI_BR_MSP, PAPI_BR_CN, PAPI_BR_PRC };
// TLB Misses
//int events[] = { PAPI_TLB_DM };
// MFlops
// (PAPI_FP_INS/PAPI_TOT_CYC) * Clock(MHz)
//int events[] = { PAPI_FP_INS, PAPI_TOT_CYC };
// MFlops (Alternate?)
// (PAPI_FP_INS/PAPI_TOT_CYC) * Clock(MHz)
//int events[] = { PAPI_DP_OPS, PAPI_TOT_CYC };
// TLB misses (Using native counters)
/*
int events[2];
int EventCode;
char * event1 = "perf::DTLB-LOADS";
char * event2 = "perf::DTLB-LOAD-MISSES";
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
*/
/*
// Stalled Cycles, front v back (Using native counters)
int events[3];
int EventCode;
char * event1 = "perf::STALLED-CYCLES-FRONTEND";
char * event2 = "perf::STALLED-CYCLES-BACKEND";
char * event3 = "perf::PERF_COUNT_HW_CPU_CYCLES";
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
PAPI_event_name_to_code( event3, &EventCode );
events[2] = EventCode;
*/
/*
// LLC Cache Misses (Using native counters)
int events[2];
int EventCode;
char * event1 = "ix86arch::LLC_REFERENCES";
char * event2 = "ix86arch::LLC_MISSES";
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
*/
/*
// Node Prefetch Misses (Using native counters)
int events[1];
int EventCode;
//char * event1 = "perf::NODE-PREFETCHES";
//char * event2 = "perf::NODE-PREFETCH-MISSES";
char * event1 = "perf::NODE-PREFETCHES";
char * event2 = "perf::NODE-LOAD-MISSES:COUNT";
//PAPI_event_name_to_code( event1, &EventCode );
//events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[0] = EventCode;
*/
/*
// CPU Stalls Due to lack of Load Buffers (Using native counters)
int events[2];
int EventCode;
char * event1 = "RESOURCE_STALLS:LB";
char * event2 = "perf::PERF_COUNT_HW_CPU_CYCLES";
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
*/
/*
// CPU Stalls Due to ANY Resource (Using native counters)
int events[2];
int EventCode;
char * event1 = "RESOURCE_STALLS:ANY";
char * event2 = "PAPI_TOT_CYC";
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
*/
/*
// CPU Stalls at Reservation Station (Using native counters)
int events[2];
int EventCode;
char * event1 = "RESOURCE_STALLS:RS";
char * event2 = "perf::PERF_COUNT_HW_CPU_CYCLES";
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
*/
/*
// CPU Stall Reason Breakdown (Using native counters)
int events[4];
int EventCode;
// Set 1
char * event1 = "RESOURCE_STALLS:ANY";
char * event2 = "RESOURCE_STALLS:LB";
char * event3 = "RESOURCE_STALLS:RS";
char * event4 = "RESOURCE_STALLS:SB";
// Set 1
// Set 2
char * event1 = "RESOURCE_STALLS:ANY";
char * event2 = "RESOURCE_STALLS:ROB";
char * event3 = "RESOURCE_STALLS:MEM_RS";
char * event4 = "RESOURCE_STALLS2:ALL_FL_EMPTY";
// Set 2
// Set 3
char * event1 = "RESOURCE_STALLS:ANY";
char * event2 = "RESOURCE_STALLS2:ALL_PRF_CONTROL";
char * event3 = "RESOURCE_STALLS2:ANY_PRF_CONTROL"; // duplicate
char * event4 = "RESOURCE_STALLS2:OOO_RSRC";
// Set 3
char * event1 = "RESOURCE_STALLS:ANY";
char * event2 = "RESOURCE_STALLS:SB";
char * event3 = "RESOURCE_STALLS:RS"; // duplicate
char * event4 = "RESOURCE_STALLS2:OOO_RSRC";
// Events that don't need to be counted
// Don't bother measuring these
//char * event1 = "RESOURCE_STALLS:FCSW"; // Always 0, don't measure
//char * event1 = "RESOURCE_STALLS:MXCSR"; // Always 0, don't measure
//char * event3 = "RESOURCE_STALLS2:BOB_FULL"; // Always trivial
//char * event3 = "RESOURCE_STALLS2:ANY_PRF_CONTROL"; // duplicate
PAPI_event_name_to_code( event1, &EventCode );
events[0] = EventCode;
PAPI_event_name_to_code( event2, &EventCode );
events[1] = EventCode;
PAPI_event_name_to_code( event3, &EventCode );
events[2] = EventCode;
PAPI_event_name_to_code( event4, &EventCode );
events[3] = EventCode;
*/
/////////////////////////////////////////////////////////////////////////
// PAPI EVENT LOADING
/////////////////////////////////////////////////////////////////////////
// Users should not need to alter anything within this section
int thread = omp_get_thread_num();
if ( (stat= PAPI_create_eventset(eventset)) != PAPI_OK)
{
PAPI_perror("PAPI_create_eventset");
exit(1);
}
for( int i = 0; i < *num_papi_events; i++ )
{
if ((stat=PAPI_add_event(*eventset,events[i])) != PAPI_OK)
{
PAPI_perror("PAPI_add_event");
exit(1);
}
}
if ((stat=PAPI_start(*eventset)) != PAPI_OK)
{
PAPI_perror("PAPI_start");
exit(1);
}
}
/*
void counter_init( int *eventset, int *num_papi_events )
{
char error_str[PAPI_MAX_STR_LEN];
// int events[] = {PAPI_TOT_INS,PAPI_BR_INS,PAPI_SR_INS};
int events[] = {PAPI_TOT_INS,PAPI_LD_INS,PAPI_FP_INS};
int events[] = {ix86arch::LLC_REFERENCES,
int stat;
int thread = omp_get_thread_num();
if( thread == 0 )
printf("Initializing PAPI counters...\n");
*num_papi_events = sizeof(events) / sizeof(int);
if ((stat = PAPI_thread_init((long unsigned int (*)(void)) omp_get_thread_num)) != PAPI_OK){
PAPI_perror("PAPI_thread_init");
exit(1);
}
if ( (stat= PAPI_create_eventset(eventset)) != PAPI_OK){
PAPI_perror("PAPI_create_eventset");
exit(1);
}
for( int i = 0; i < *num_papi_events; i++ ){
if ((stat=PAPI_add_event(*eventset,events[i])) != PAPI_OK){
PAPI_perror("PAPI_add_event");
exit(1);
}
}
if ((stat=PAPI_start(*eventset)) != PAPI_OK){
PAPI_perror("PAPI_start");
exit(1);
}
}
*/
// Stops the papi counters and prints results
void counter_stop( int * eventset, int num_papi_events, Input * I )
{
int * events = malloc(num_papi_events * sizeof(int));
int n = num_papi_events;
PAPI_list_events( *eventset, events, &n );
PAPI_event_info_t info;
long_long * values = malloc( num_papi_events * sizeof(long_long));
PAPI_stop(*eventset, values);
int thread = omp_get_thread_num();
int nthreads = omp_get_num_threads();
static long LLC_cache_miss = 0;
static long total_cycles = 0;
static long FLOPS = 0;
static long stall_any = 0;
static long stall_SB = 0;
static long stall_RS = 0;
static long stall_OO = 0;
static long tlb_load = 0;
static long tlb_load_m = 0;
static long tlb_store = 0;
static long tlb_store_m = 0;
#pragma omp master
{
I->vals_accum = malloc( num_papi_events * sizeof(long long));
for(int i=0; i < num_papi_events ; i ++)
I->vals_accum[i] = 0;
}
#pragma omp barrier
#pragma omp critical (papi)
{
printf("Thread %d\n", thread);
for( int i = 0; i < num_papi_events; i++ )
{
I->vals_accum[i] += values[i];
PAPI_get_event_info(events[i], &info);
printf("%-15lld\t%s\t%s\n", values[i],info.symbol,info.long_descr);
if( strcmp(info.symbol, "PAPI_L3_TCM") == 0 )
LLC_cache_miss += values[i];
if( strcmp(info.symbol, "PAPI_TOT_CYC") == 0 )
total_cycles += values[i];
if( strcmp(info.symbol, "PAPI_SP_OPS") == 0 )
FLOPS += values[i];
if( strcmp(info.symbol, "RESOURCE_STALLS:ANY") == 0 )
stall_any += values[i];
if( strcmp(info.symbol, "RESOURCE_STALLS:SB") == 0 )
stall_SB += values[i];
if( strcmp(info.symbol, "RESOURCE_STALLS:RS") == 0 )
stall_RS += values[i];
if( strcmp(info.symbol, "RESOURCE_STALLS2:OOO_RSRC") == 0 )
stall_OO += values[i];
if( strcmp(info.symbol, "perf::DTLB-LOADS") == 0 )
tlb_load += values[i];
if( strcmp(info.symbol, "perf::DTLB-LOAD-MISSES") == 0 )
tlb_load_m += values[i];
if( strcmp(info.symbol, "perf::DTLB-STORES") == 0 )
tlb_store += values[i];
if( strcmp(info.symbol, "perf::DTLB-STORE-MISSES") == 0 )
tlb_store_m += values[i];
}
free(values);
}
{
#pragma omp barrier
}
#pragma omp master
{
if( omp_get_num_threads() > 1){
printf("Thread Totals:\n");
for( int i = 0; i < num_papi_events; i++ )
{
PAPI_get_event_info(events[i], &info);
printf("%-15lld\t%s\t%s\n", I->vals_accum[i],info.symbol,info.long_descr);
}
}
free( I->vals_accum );
border_print();
center_print("PERFORMANCE SUMMARY", 79);
border_print();
long cycles = (long) (total_cycles / (double) nthreads);
double bw = LLC_cache_miss*64./cycles*2.8e9/1024./1024./1024.;
if( I->papi_event_set == 0 )
printf("GFLOPs: %.3lf\n", FLOPS / (double) cycles * 2.8 );
if( I->papi_event_set == 1 )
printf("Bandwidth: %.3lf (GB/s)\n", bw);
if( I->papi_event_set == 2 )
{
printf("%-30s %.2lf%%\n", "Store Buffer Full:",
stall_SB / (double) stall_any * 100.);
printf("%-30s %.2lf%%\n", "Reservation Station Full:",
stall_RS / (double) stall_any * 100.);
printf("%-30s %.2lf%%\n", "OO Pipeline Full:",
stall_OO / (double) stall_any * 100.);
}
if( I->papi_event_set == 3 )
printf("CPU Stalled Cycles: %.2lf%%\n",
stall_any / (double) total_cycles * 100.);
if( I->papi_event_set == 7 )
{
printf("%-30s %.2lf%%\n", "Data TLB Load Miss Rate: ",
tlb_load_m / (double) tlb_load * 100 );
printf("%-30s %.2lf%%\n", "Data TLB Store Miss Rate: ",
tlb_store_m / (double) tlb_store * 100 );
}
border_print();
}
free(events);
}
#endif