Commit a84385fa authored by Simon Marlow
Browse files

Refactor PAPI support, and add profiling of multithreaded GC

parent bf4d6a58
......@@ -14,38 +14,25 @@
#ifdef USE_PAPI /* ugly */
#include <papi.h>
#include "Papi.h"
#include "Rts.h"
#include "RtsUtils.h"
#include "Stats.h"
#include "RtsFlags.h"
#include "OSThreads.h"
// used to protect the aggregated counters
#ifdef THREADED_RTS
static Mutex papi_counter_mutex;
#endif
/* One entry of the table of events to count: the code that is handed to
 * PAPI_add_event, paired with the event's name as text. */
struct _papi_events {
    int   event_code;   /* event code passed to PAPI */
    char *event_name;   /* printable name of the event */
};
/* Append EVENT to the papi_events table: store its code and its
 * stringified name in slot n_papi_events, then advance n_papi_events.
 * The capacity check against MAX_PAPI_EVENTS is only an ASSERT.
 * NOTE(review): expands to a bare { } block rather than do { } while (0),
 * so it cannot be used as the body of an unbraced if that has an else.
 * NOTE(review): a macro of the same name is defined again inside
 * init_countable_events -- confirm which definition is meant to survive. */
#define PAPI_ADD_EVENT(EVENT) \
{ \
ASSERT(n_papi_events<MAX_PAPI_EVENTS); \
papi_events[n_papi_events].event_code = EVENT; \
papi_events[n_papi_events].event_name = #EVENT; \
n_papi_events++; \
}
/* Print the value of counter EVENT taken from the counter array EVENTSET,
 * formatted by ullong_format_string with commas requested.
 * Requires a buffer named `temp` to be in scope where the macro is used. */
#define PAPI_REPORT(EVENTSET,EVENT) \
{ \
ullong_format_string(papi_counter(EVENTSET,EVENT),temp,rtsTrue/*commas*/); \
statsPrintf(" (" #EVENT ") : %s\n",temp); \
}
/* Print counter EVENT as a percentage of counter EVENTTOT, both looked up
 * in the counter array EVENTSET, with one decimal place.
 * NOTE(review): there is no guard for the EVENTTOT counter being zero. */
#define PAPI_REPORT_PCT(EVENTSET,EVENT,EVENTTOT) \
statsPrintf(" (" #EVENT ") %% of (" #EVENTTOT ") : %.1f%%\n", \
papi_counter(EVENTSET,EVENT)*100.0/papi_counter(EVENTSET,EVENTTOT))
/* Beware, these counters are Opteron specific
* I obtained the numbers using the papi_avail
* and papi_native_avail utilities.
......@@ -65,12 +52,16 @@ struct _papi_events {
#define DC_L2_REFILL_MOES 0x40001e1b
#define DC_SYS_REFILL_MOES 0x40001e1c
/* Number of counted events: the running count kept in n_papi_events */
#define N_PAPI_EVENTS n_papi_events
/* This is bad, it should be in a header */
#define BIG_STRING_LEN 512
/* Evaluate a PAPI call, store its result in papi_error, and emit a debug
 * message naming the file, line and error code when the result is not
 * PAPI_OK.  Execution continues after a failure.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement: "if (c) PAPI_CHECK(x); else ..." parses as intended
 * and no else can bind to the macro's internal if. */
#define PAPI_CHECK(CALL) \
  do { \
    if((papi_error=(CALL)) != PAPI_OK) { \
      debugBelch("PAPI function failed in module %s at line %d with error code %d\n", \
                 __FILE__,__LINE__,papi_error); \
    } \
  } while (0)
/* While PAPI reporting is going on this flag is on */
int papi_is_reporting;
......@@ -100,6 +91,9 @@ long_long gc_cycles;
static long_long papi_counter(long_long values[],int event);
static void papi_add_events(int EventSet);
/* If you want to add events to count, extend the
* init_countable_events and the papi_report function.
* Be aware that your processor can count a limited number
......@@ -109,31 +103,43 @@ long_long gc_cycles;
/* Fill the papi_events table with the events that will be counted.
 *
 * PAPI_TOT_INS is always registered.  The remaining events are selected
 * by RtsFlags.PapiFlags.eventType; when it matches none of the known
 * event types, PAPI_STL_ICY is registered instead.  Exactly one branch
 * of the chain runs.
 *
 * Calls barf() if an event would be stored past MAX_PAPI_EVENTS entries.
 */
static void
init_countable_events(void)
{
/* Store EVENT's code and stringified name in the next free slot of
 * papi_events.  Written as do { } while (0) so that it is a single
 * statement wherever it is used. */
#define PAPI_ADD_EVENT(EVENT)                                   \
    do {                                                        \
        if (n_papi_events >= MAX_PAPI_EVENTS) {                 \
            barf("too many PAPI events");                       \
        }                                                       \
        papi_events[n_papi_events].event_code = EVENT;          \
        papi_events[n_papi_events].event_name = #EVENT;         \
        n_papi_events++;                                        \
    } while (0)

    PAPI_ADD_EVENT(PAPI_TOT_INS);

    if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
        PAPI_ADD_EVENT(FR_BR);
        PAPI_ADD_EVENT(FR_BR_MIS);
        /* Docs are wrong? Opteron does not count indirect branch misses exclusively */
        PAPI_ADD_EVENT(FR_BR_MISCOMPARE);
    } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
        PAPI_ADD_EVENT(FR_DISPATCH_STALLS);
        PAPI_ADD_EVENT(FR_DISPATCH_STALLS_BR);
        PAPI_ADD_EVENT(FR_DISPATCH_STALLS_FULL_LS);
    } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
        PAPI_ADD_EVENT(PAPI_L1_DCA);
        PAPI_ADD_EVENT(PAPI_L1_DCM);
    } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
        PAPI_ADD_EVENT(PAPI_L2_DCA);
        PAPI_ADD_EVENT(PAPI_L2_DCM);
    } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
        PAPI_ADD_EVENT(DC_L2_REFILL_MOES);
        PAPI_ADD_EVENT(DC_SYS_REFILL_MOES);
        PAPI_ADD_EVENT(FR_BR_MIS);
    } else {
        PAPI_ADD_EVENT(PAPI_STL_ICY);
    }

    // We might also consider:
    //  PAPI_BR_MSP     Conditional branch instructions mispredicted
    //  PAPI_RES_STL    Cycles stalled on any resource
}
......@@ -154,21 +160,33 @@ papi_gc_cycles()
}
/* This function reports counters for GC and mutator */
void
static void
papi_report(long_long PapiCounters[])
{
/* I need to improve formatting aesthetics */
/* Report the value of a counter */
#define PAPI_REPORT(EVENTSET,EVENT) \
{ \
ullong_format_string(papi_counter(EVENTSET,EVENT),temp,rtsTrue/*commas*/); \
statsPrintf(" (" #EVENT ") : %s\n",temp); \
}
/* Report the value of a counter as a percentage of another counter */
#define PAPI_REPORT_PCT(EVENTSET,EVENT,EVENTTOT) \
statsPrintf(" (" #EVENT ") %% of (" #EVENTTOT ") : %.1f%%\n", \
papi_counter(EVENTSET,EVENT)*100.0/papi_counter(EVENTSET,EVENTTOT))
/* I need to improve formatting aesthetics */
PAPI_REPORT(PapiCounters,PAPI_TOT_INS);
if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
PAPI_REPORT(PapiCounters,FR_BR);
PAPI_REPORT(PapiCounters,FR_BR_MIS);
PAPI_REPORT_PCT(PapiCounters,FR_BR_MIS,FR_BR);
PAPI_REPORT_PCT(PapiCounters,FR_BR_MISCOMPARE,FR_BR);
}
if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
else if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
PAPI_REPORT(PapiCounters,FR_DISPATCH_STALLS);
PAPI_REPORT(PapiCounters,FR_DISPATCH_STALLS_BR);
//PAPI_REPORT_PCT(PapiCounters,FR_DISPATCH_STALLS_BR,PAPI_TOT_CYC);
......@@ -176,50 +194,87 @@ papi_report(long_long PapiCounters[])
//PAPI_REPORT_PCT(PapiCounters,FR_DISPATCH_STALLS_FULL_LS,PAPI_TOT_CYC);
}
if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
PAPI_REPORT(PapiCounters,PAPI_L1_DCA);
PAPI_REPORT(PapiCounters,PAPI_L1_DCM);
PAPI_REPORT_PCT(PapiCounters,PAPI_L1_DCM,PAPI_L1_DCA);
}
if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
PAPI_REPORT(PapiCounters,PAPI_L2_DCA);
PAPI_REPORT(PapiCounters,PAPI_L2_DCM);
PAPI_REPORT_PCT(PapiCounters,PAPI_L2_DCM,PAPI_L2_DCA);
}
if(RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
PAPI_REPORT(PapiCounters,DC_L2_REFILL_MOES);
PAPI_REPORT(PapiCounters,DC_SYS_REFILL_MOES);
PAPI_REPORT(PapiCounters,FR_BR_MIS);
}
else {
PAPI_REPORT(PapiCounters,PAPI_STL_ICY);
}
}
/* Print the PAPI section of the statistics output.
 *
 * Two parts are emitted in order, each with a heading line: the mutator
 * part (papi_mut_cycles() followed by the MutatorCounters report) and
 * the GC part (papi_gc_cycles() followed by the GCCounters report).
 */
void
papi_stats_report (void)
{
    /* Mutator part */
    statsPrintf(" -- CPU Mutator counters --\n");
    papi_mut_cycles();
    papi_report(MutatorCounters);

    /* GC part */
    statsPrintf("\n -- CPU GC counters --\n");
    papi_gc_cycles();
    papi_report(GCCounters);
}
/* Register the calling thread with PAPI, create an event set for it and
 * add the events listed in papi_events to that set.
 *
 * event_set: out-parameter; receives the handle produced by
 *            PAPI_create_eventset and is then passed to papi_add_events.
 *
 * Failures of the two PAPI calls made here are reported through
 * PAPI_CHECK (a debug message); they are not returned to the caller.
 */
void
papi_init_eventset (int *event_set)
{
    /* PAPI_register_thread() returns a status code like every other PAPI
     * call, so check it instead of silently dropping it. */
    PAPI_CHECK( PAPI_register_thread());
    PAPI_CHECK( PAPI_create_eventset(event_set));
    papi_add_events(*event_set);
}
void
papi_init_eventsets(void)
papi_init (void)
{
/* Initialise the performance tracking library */
int ver;
if ((ver = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) {
if (ver > 0) {
errorBelch("PAPI_library_init: wrong version: %x", ver);
stg_exit(EXIT_FAILURE);
} else {
sysErrorBelch("PAPI_library_init");
stg_exit(EXIT_FAILURE);
}
}
init_countable_events();
#ifdef THREADED_RTS
{
int err;
if ((err = PAPI_thread_init(osThreadId)) < 0) {
barf("PAPI_thread_init: %d",err);
}
/* One event set for the mutator and another for the GC */
PAPI_CHECK( PAPI_create_eventset(&MutatorEvents));
PAPI_CHECK( PAPI_create_eventset(&GCEvents));
initMutex(&papi_counter_mutex);
}
#endif
/* Both sets contain the same events */
papi_add_events(MutatorEvents);
papi_add_events(GCEvents);
init_countable_events();
papi_init_eventset(&MutatorEvents);
papi_init_eventset(&GCEvents);
}
/* Extract the value corresponding to an event */
long_long
static long_long
papi_counter(long_long values[],int event)
{
int i;
for(i=0;i<N_PAPI_EVENTS;i++) {
for(i=0;i<n_papi_events;i++) {
if(papi_events[i].event_code==event) {
return values[i];
}
......@@ -230,11 +285,11 @@ papi_counter(long_long values[],int event)
}
/* Add the events of papi_events into an event set */
void
static void
papi_add_events(int EventSet)
{
int i;
for(i=0;i<N_PAPI_EVENTS;i++) {
for(i=0;i<n_papi_events;i++) {
if((papi_error=PAPI_add_event(EventSet,
papi_events[i].event_code))
!= PAPI_OK)
......@@ -253,32 +308,57 @@ papi_add_events(int EventSet)
/* Begin counting mutator events: start the MutatorEvents set and record
 * the current PAPI_cycles() value in start_mutator_cycles.  Both steps
 * run while papi_counter_mutex is held. */
void
papi_start_mutator_count(void)
{
    ACQUIRE_LOCK(&papi_counter_mutex);
    {
        PAPI_CHECK( PAPI_start(MutatorEvents));
        start_mutator_cycles = PAPI_cycles();
    }
    RELEASE_LOCK(&papi_counter_mutex);
}
/* End a mutator counting interval: add the cycles elapsed since
 * start_mutator_cycles to mutator_cycles, accumulate the MutatorEvents
 * counters into MutatorCounters, and stop the event set.  All steps run
 * while papi_counter_mutex is held. */
void
papi_stop_mutator_count(void)
{
    ACQUIRE_LOCK(&papi_counter_mutex);
    {
        mutator_cycles = mutator_cycles + (PAPI_cycles() - start_mutator_cycles);
        PAPI_CHECK( PAPI_accum(MutatorEvents,MutatorCounters));
        PAPI_CHECK( PAPI_stop(MutatorEvents,NULL));
    }
    RELEASE_LOCK(&papi_counter_mutex);
}
/* Begin counting GC events: start the GCEvents set and record the
 * current PAPI_cycles() value in start_gc_cycles.
 *
 * The event set is started exactly once per call, and both steps run
 * while papi_counter_mutex is held, matching papi_start_mutator_count. */
void
papi_start_gc_count(void)
{
    ACQUIRE_LOCK(&papi_counter_mutex);
    PAPI_CHECK( PAPI_start(GCEvents));
    start_gc_cycles = PAPI_cycles();
    RELEASE_LOCK(&papi_counter_mutex);
}
/* End a GC counting interval: accumulate the GCEvents counters into
 * GCCounters, stop the event set, and add the cycles elapsed since
 * start_gc_cycles to gc_cycles.
 *
 * Each step is performed exactly once per call so that neither the
 * counters nor the cycle total are double-counted, and all of them run
 * while papi_counter_mutex is held. */
void
papi_stop_gc_count(void)
{
    ACQUIRE_LOCK(&papi_counter_mutex);
    PAPI_CHECK( PAPI_accum(GCEvents,GCCounters));
    PAPI_CHECK( PAPI_stop(GCEvents,NULL));
    gc_cycles += PAPI_cycles() - start_gc_cycles;
    RELEASE_LOCK(&papi_counter_mutex);
}
/* Start counting on the given event set while holding
 * papi_counter_mutex.
 *
 * event_set: PAPI event set handle to start. */
void
papi_thread_start_gc_count(int event_set)
{
    ACQUIRE_LOCK(&papi_counter_mutex);
    {
        PAPI_CHECK( PAPI_start(event_set));
    }
    RELEASE_LOCK(&papi_counter_mutex);
}
/* Accumulate the counters of the given event set into the shared
 * GCCounters array and then stop the set, all while holding
 * papi_counter_mutex.
 *
 * event_set: PAPI event set handle to accumulate and stop. */
void
papi_thread_stop_gc_count(int event_set)
{
    ACQUIRE_LOCK(&papi_counter_mutex);
    {
        PAPI_CHECK( PAPI_accum(event_set,GCCounters));
        PAPI_CHECK( PAPI_stop(event_set,NULL));
    }
    RELEASE_LOCK(&papi_counter_mutex);
}
#endif /* USE_PAPI */
......@@ -5,41 +5,21 @@
*
* ---------------------------------------------------------------------------*/
#include <papi.h>
/* Evaluate a PAPI call, store its result in papi_error, and emit a debug
 * message naming the file, line and error code when the result is not
 * PAPI_OK.  Execution continues after a failure.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement: "if (c) PAPI_CHECK(x); else ..." parses as intended
 * and no else can bind to the macro's internal if. */
#define PAPI_CHECK(CALL) \
  do { \
    if((papi_error=(CALL)) != PAPI_OK) { \
      debugBelch("PAPI function failed in module %s at line %d with error code %d\n", \
                 __FILE__,__LINE__,papi_error); \
    } \
  } while (0)
/* Check the error value of a PAPI call, reporting an error, if needed */
extern int papi_error;
/* While PAPI reporting is going on this flag is on */
extern int papi_is_reporting;
/* Event sets and counter arrays for GC and mutator */
extern int MutatorEvents;
extern int GCEvents;
extern long_long MutatorCounters[];
extern long_long GCCounters[];
long_long papi_counter(long_long values[],int event);
void papi_report(long_long PapiCounters[]);
void papi_mut_cycles(void);
void papi_gc_cycles(void);
void papi_add_events(int EventSet);
void papi_init_eventsets(void);
void papi_stats_report(void);
void papi_init_eventset(int * event_set);
void papi_init(void);
void papi_start_mutator_count(void);
void papi_stop_mutator_count(void);
void papi_start_gc_count(void);
void papi_stop_gc_count(void);
// for multithreaded GC, each sub-thread uses these functions to count
// events and aggregate them into the main GC counters.
void papi_thread_start_gc_count(int event_set);
void papi_thread_stop_gc_count(int event_set);
......@@ -167,28 +167,8 @@ hs_init(int *argc, char **argv[])
argv++; argc--;
#endif
/* Initialise the performance tracking library */
#ifdef USE_PAPI
{
int ver;
if ((ver = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) {
if (ver > 0) {
errorBelch("PAPI_library_init: wrong version: %x", ver);
stg_exit(EXIT_FAILURE);
} else {
sysErrorBelch("PAPI_library_init");
stg_exit(EXIT_FAILURE);
}
}
}
#ifdef THREADED_RTS
{
int err;
if ((err = PAPI_thread_init(osThreadId)) < 0) {
barf("PAPI_thread_init: %d",err);
}
}
#endif
papi_init();
#endif
/* Set the RTS flags to default values. */
......
......@@ -171,8 +171,6 @@ stat_endInit(void)
InitElapsedTime = elapsed - ElapsedTimeStart;
}
#if USE_PAPI
papi_init_eventsets();
/* We start counting events for the mutator
* when garbage collection starts
* we switch to the GC event set. */
......@@ -606,16 +604,7 @@ stat_exit(int alloc)
*/
#if USE_PAPI
/* PAPI reporting, should put somewhere else?
* Note that the cycles are counted _after_ the initialization of the RTS -- AR */
statsPrintf(" -- CPU Mutator counters --\n");
papi_mut_cycles();
papi_report(MutatorCounters);
statsPrintf("\n -- CPU GC counters --\n");
papi_gc_cycles();
papi_report(GCCounters);
papi_stats_report();
#endif
}
......
......@@ -40,6 +40,7 @@
#include "RetainerProfile.h"
#include "RaiseAsync.h"
#include "Sparks.h"
#include "Papi.h"
#include "GC.h"
#include "Compact.h"
......@@ -873,6 +874,10 @@ alloc_gc_thread (gc_thread *t, int n)
init_gc_thread(t);
#ifdef USE_PAPI
t->papi_events = -1;
#endif
t->steps = stgMallocBytes(RtsFlags.GcFlags.generations *
sizeof(step_workspace *),
"initialise_gc_thread");
......@@ -1011,7 +1016,20 @@ gc_thread_mainloop (void)
gct->wakeup = rtsFalse;
if (gct->exit) break;
#ifdef USE_PAPI
// start performance counters in this thread...
if (gct->papi_events == -1) {
papi_init_eventset(&gct->papi_events);
}
papi_thread_start_gc_count(gct->papi_events);
#endif
gc_thread_work();
#ifdef USE_PAPI
// count events in this thread towards the GC totals
papi_thread_stop_gc_count(gct->papi_events);
#endif
}
}
#endif
......
......@@ -142,6 +142,10 @@ typedef struct gc_thread_ {
lnat thunk_selector_depth; // ummm.... not used as of now
#ifdef USE_PAPI
int papi_events;
#endif
} gc_thread;
extern nat N;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment