Skip to content

Commit 5c4bf04

Browse files
jtljtl
authored and committed
MFC r290811:
Fix hwpmc "stalled" behavior Currently, there is a single pm_stalled flag that tracks whether a performance monitor was "stalled" due to insufficient ring buffer space for samples. However, because the same performance monitor can run on multiple processes or threads at the same time, a single pm_stalled flag that impacts them all seems insufficient. In particular, you can hit corner cases where the code fails to stop performance monitors during a context switch out, because it thinks the performance monitor is already stopped. However, in reality, it may be that only the monitor running on a different CPU was stalled. This patch attempts to fix that behavior by tracking on a per-CPU basis whether a PM desires to run and whether it is "stalled". This lets the code make better decisions about when to stop PMs and when to try to restart them. Ideally, we should avoid the case where the code fails to stop a PM during a context switch out. MFC r290813: Optimizations to the way hwpmc gathers user callchains Changes to the code to gather user stacks: * Delay setting pmc_cpumask until we actually have the stack. * When recording user stack traces, only walk the portion of the ring that should have samples for us. MFC r290929: Change the driver stats to what they really are: unsigned values. When pmcstat exits after some samples were dropped, give the user an idea of how many were lost. (Granted, these are global numbers, but they may still help quantify the scope of the loss.) MFC r290930: Improve accuracy of PMC sampling frequency The code tracks a counter which is the number of events until the next sample. On context switch in, it loads the saved counter. On context switch out, it tries to calculate a new saved counter. Problems: 1. The saved counter was shared by all threads in a process. This means that all threads would be initially loaded with the same saved counter, which could result in sampling more often than once every X number of events. 2. 
The calculation to determine a new saved counter was backwards. It added when it should have subtracted, and subtracted when it should have added. Assume a single-threaded process with a reload count of 1000 events. Assuming the counter on context switch in was 100 and the counter on context switch out was 50 (meaning the thread has "consumed" 50 more events), the code would calculate a new saved counter of 150 (instead of the proper 50). Fix: 1. As soon as the saved counter is used to initialize a monitor for a thread on context switch in, set the saved counter to the reload count. That way, subsequent threads to use the saved counter will get the full reload count, assuring we sample at least once every X number of events (across all threads). 2. Change the calculation of the saved counter. Due to the change to the saved counter in #1, we simply need to add (modulo the reload count) the remaining counter time we retrieve from the CPU when a thread is context switched out. MFC r291016: Support a wider history counter in pmcstat(8) gmon output pmcstat(8) contains an option to output sampling data in a gmon format compatible with gprof(1). Currently, it uses the default histcounter, which is an (unsigned short). With large sets of sampling data, it is possible to overflow the maximum value provided by an (unsigned short). This change adds the -e argument to pmcstat. If -e and -g are both specified, pmcstat will use a histcounter type of uint64_t. MFC r291017: Fix the date on the pmcstat(8) man page from r291016.
1 parent ba34754 commit 5c4bf04

File tree

7 files changed

+207
-79
lines changed

7 files changed

+207
-79
lines changed

lib/libpmc/pmc.h

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,15 @@
3636
* Driver statistics.
3737
*/
3838
struct pmc_driverstats {
39-
int pm_intr_ignored; /* #interrupts ignored */
40-
int pm_intr_processed; /* #interrupts processed */
41-
int pm_intr_bufferfull; /* #interrupts with ENOSPC */
42-
int pm_syscalls; /* #syscalls */
43-
int pm_syscall_errors; /* #syscalls with errors */
44-
int pm_buffer_requests; /* #buffer requests */
45-
int pm_buffer_requests_failed; /* #failed buffer requests */
46-
int pm_log_sweeps; /* #sample buffer processing passes */
39+
unsigned int pm_intr_ignored; /* #interrupts ignored */
40+
unsigned int pm_intr_processed; /* #interrupts processed */
41+
unsigned int pm_intr_bufferfull; /* #interrupts with ENOSPC */
42+
unsigned int pm_syscalls; /* #syscalls */
43+
unsigned int pm_syscall_errors; /* #syscalls with errors */
44+
unsigned int pm_buffer_requests; /* #buffer requests */
45+
unsigned int pm_buffer_requests_failed; /* #failed buffer requests */
46+
unsigned int pm_log_sweeps; /* #sample buffer processing
47+
passes */
4748
};
4849

4950
/*

sys/dev/hwpmc/hwpmc_mod.c

Lines changed: 108 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1287,8 +1287,16 @@ pmc_process_csw_in(struct thread *td)
12871287
*/
12881288
if (PMC_TO_MODE(pm) == PMC_MODE_TS) {
12891289
mtx_pool_lock_spin(pmc_mtxpool, pm);
1290+
1291+
/*
1292+
* Use the saved value calculated after the most recent
1293+
* thread switch out to start this counter. Reset
1294+
* the saved count in case another thread from this
1295+
* process switches in before any threads switch out.
1296+
*/
12901297
newvalue = PMC_PCPU_SAVED(cpu,ri) =
12911298
pp->pp_pmcs[ri].pp_pmcval;
1299+
pp->pp_pmcs[ri].pp_pmcval = pm->pm_sc.pm_reloadcount;
12921300
mtx_pool_unlock_spin(pmc_mtxpool, pm);
12931301
} else {
12941302
KASSERT(PMC_TO_MODE(pm) == PMC_MODE_TC,
@@ -1303,6 +1311,15 @@ pmc_process_csw_in(struct thread *td)
13031311
PMCDBG3(CSW,SWI,1,"cpu=%d ri=%d new=%jd", cpu, ri, newvalue);
13041312

13051313
pcd->pcd_write_pmc(cpu, adjri, newvalue);
1314+
1315+
/* If a sampling mode PMC, reset stalled state. */
1316+
if (PMC_TO_MODE(pm) == PMC_MODE_TS)
1317+
CPU_CLR_ATOMIC(cpu, &pm->pm_stalled);
1318+
1319+
/* Indicate that we desire this to run. */
1320+
CPU_SET_ATOMIC(cpu, &pm->pm_cpustate);
1321+
1322+
/* Start the PMC. */
13061323
pcd->pcd_start_pmc(cpu, adjri);
13071324
}
13081325

@@ -1397,8 +1414,14 @@ pmc_process_csw_out(struct thread *td)
13971414
("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
13981415
__LINE__, PMC_TO_ROWINDEX(pm), ri));
13991416

1400-
/* Stop hardware if not already stopped */
1401-
if (pm->pm_stalled == 0)
1417+
/*
1418+
* Change desired state, and then stop if not stalled.
1419+
* This two-step dance should avoid race conditions where
1420+
* an interrupt re-enables the PMC after this code has
1421+
* already checked the pm_stalled flag.
1422+
*/
1423+
CPU_CLR_ATOMIC(cpu, &pm->pm_cpustate);
1424+
if (!CPU_ISSET(cpu, &pm->pm_stalled))
14021425
pcd->pcd_stop_pmc(cpu, adjri);
14031426

14041427
/* reduce this PMC's runcount */
@@ -1421,31 +1444,43 @@ pmc_process_csw_out(struct thread *td)
14211444

14221445
pcd->pcd_read_pmc(cpu, adjri, &newvalue);
14231446

1424-
tmp = newvalue - PMC_PCPU_SAVED(cpu,ri);
1425-
1426-
PMCDBG3(CSW,SWO,1,"cpu=%d ri=%d tmp=%jd", cpu, ri,
1427-
tmp);
1428-
14291447
if (mode == PMC_MODE_TS) {
1448+
PMCDBG3(CSW,SWO,1,"cpu=%d ri=%d tmp=%jd (samp)",
1449+
cpu, ri, PMC_PCPU_SAVED(cpu,ri) - newvalue);
14301450

14311451
/*
14321452
* For sampling process-virtual PMCs,
1433-
* we expect the count to be
1434-
* decreasing as the 'value'
1435-
* programmed into the PMC is the
1436-
* number of events to be seen till
1437-
* the next sampling interrupt.
1453+
* newvalue is the number of events to be seen
1454+
* until the next sampling interrupt.
1455+
* We can just add the events left from this
1456+
* invocation to the counter, then adjust
1457+
* in case we overflow our range.
1458+
*
1459+
* (Recall that we reload the counter every
1460+
* time we use it.)
14381461
*/
1439-
if (tmp < 0)
1440-
tmp += pm->pm_sc.pm_reloadcount;
14411462
mtx_pool_lock_spin(pmc_mtxpool, pm);
1442-
pp->pp_pmcs[ri].pp_pmcval -= tmp;
1443-
if ((int64_t) pp->pp_pmcs[ri].pp_pmcval <= 0)
1444-
pp->pp_pmcs[ri].pp_pmcval +=
1463+
1464+
pp->pp_pmcs[ri].pp_pmcval += newvalue;
1465+
if (pp->pp_pmcs[ri].pp_pmcval >
1466+
pm->pm_sc.pm_reloadcount)
1467+
pp->pp_pmcs[ri].pp_pmcval -=
14451468
pm->pm_sc.pm_reloadcount;
1469+
KASSERT(pp->pp_pmcs[ri].pp_pmcval > 0 &&
1470+
pp->pp_pmcs[ri].pp_pmcval <=
1471+
pm->pm_sc.pm_reloadcount,
1472+
("[pmc,%d] pp_pmcval outside of expected "
1473+
"range cpu=%d ri=%d pp_pmcval=%jx "
1474+
"pm_reloadcount=%jx", __LINE__, cpu, ri,
1475+
pp->pp_pmcs[ri].pp_pmcval,
1476+
pm->pm_sc.pm_reloadcount));
14461477
mtx_pool_unlock_spin(pmc_mtxpool, pm);
14471478

14481479
} else {
1480+
tmp = newvalue - PMC_PCPU_SAVED(cpu,ri);
1481+
1482+
PMCDBG3(CSW,SWO,1,"cpu=%d ri=%d tmp=%jd (count)",
1483+
cpu, ri, tmp);
14491484

14501485
/*
14511486
* For counting process-virtual PMCs,
@@ -2263,8 +2298,9 @@ pmc_release_pmc_descriptor(struct pmc *pm)
22632298
pmc_select_cpu(cpu);
22642299

22652300
/* switch off non-stalled CPUs */
2301+
CPU_CLR_ATOMIC(cpu, &pm->pm_cpustate);
22662302
if (pm->pm_state == PMC_STATE_RUNNING &&
2267-
pm->pm_stalled == 0) {
2303+
!CPU_ISSET(cpu, &pm->pm_stalled)) {
22682304

22692305
phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
22702306

@@ -2678,8 +2714,15 @@ pmc_start(struct pmc *pm)
26782714
if ((error = pcd->pcd_write_pmc(cpu, adjri,
26792715
PMC_IS_SAMPLING_MODE(mode) ?
26802716
pm->pm_sc.pm_reloadcount :
2681-
pm->pm_sc.pm_initial)) == 0)
2717+
pm->pm_sc.pm_initial)) == 0) {
2718+
/* If a sampling mode PMC, reset stalled state. */
2719+
if (PMC_IS_SAMPLING_MODE(mode))
2720+
CPU_CLR_ATOMIC(cpu, &pm->pm_stalled);
2721+
2722+
/* Indicate that we desire this to run. Start it. */
2723+
CPU_SET_ATOMIC(cpu, &pm->pm_cpustate);
26822724
error = pcd->pcd_start_pmc(cpu, adjri);
2725+
}
26832726
critical_exit();
26842727

26852728
pmc_restore_cpu_binding(&pb);
@@ -2741,6 +2784,7 @@ pmc_stop(struct pmc *pm)
27412784
ri = PMC_TO_ROWINDEX(pm);
27422785
pcd = pmc_ri_to_classdep(md, ri, &adjri);
27432786

2787+
CPU_CLR_ATOMIC(cpu, &pm->pm_cpustate);
27442788
critical_enter();
27452789
if ((error = pcd->pcd_stop_pmc(cpu, adjri)) == 0)
27462790
error = pcd->pcd_read_pmc(cpu, adjri, &pm->pm_sc.pm_initial);
@@ -4049,12 +4093,13 @@ pmc_process_interrupt(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
40494093

40504094
ps = psb->ps_write;
40514095
if (ps->ps_nsamples) { /* in use, reader hasn't caught up */
4052-
pm->pm_stalled = 1;
4096+
CPU_SET_ATOMIC(cpu, &pm->pm_stalled);
40534097
atomic_add_int(&pmc_stats.pm_intr_bufferfull, 1);
40544098
PMCDBG6(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d",
40554099
cpu, pm, (void *) tf, inuserspace,
40564100
(int) (psb->ps_write - psb->ps_samples),
40574101
(int) (psb->ps_read - psb->ps_samples));
4102+
callchaindepth = 1;
40584103
error = ENOMEM;
40594104
goto done;
40604105
}
@@ -4112,7 +4157,8 @@ pmc_process_interrupt(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
41124157

41134158
done:
41144159
/* mark CPU as needing processing */
4115-
CPU_SET_ATOMIC(cpu, &pmc_cpumask);
4160+
if (callchaindepth != PMC_SAMPLE_INUSE)
4161+
CPU_SET_ATOMIC(cpu, &pmc_cpumask);
41164162

41174163
return (error);
41184164
}
@@ -4126,10 +4172,9 @@ pmc_process_interrupt(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
41264172
static void
41274173
pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
41284174
{
4129-
int i;
41304175
struct pmc *pm;
41314176
struct thread *td;
4132-
struct pmc_sample *ps;
4177+
struct pmc_sample *ps, *ps_end;
41334178
struct pmc_samplebuffer *psb;
41344179
#ifdef INVARIANTS
41354180
int ncallchains;
@@ -4148,15 +4193,17 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
41484193

41494194
/*
41504195
* Iterate through all deferred callchain requests.
4196+
* Walk from the current read pointer to the current
4197+
* write pointer.
41514198
*/
41524199

4153-
ps = psb->ps_samples;
4154-
for (i = 0; i < pmc_nsamples; i++, ps++) {
4155-
4200+
ps = psb->ps_read;
4201+
ps_end = psb->ps_write;
4202+
do {
41564203
if (ps->ps_nsamples != PMC_SAMPLE_INUSE)
4157-
continue;
4204+
goto next;
41584205
if (ps->ps_td != td)
4159-
continue;
4206+
goto next;
41604207

41614208
KASSERT(ps->ps_cpu == cpu,
41624209
("[pmc,%d] cpu mismatch ps_cpu=%d pcpu=%d", __LINE__,
@@ -4181,7 +4228,12 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
41814228
#ifdef INVARIANTS
41824229
ncallchains++;
41834230
#endif
4184-
}
4231+
4232+
next:
4233+
/* increment the pointer, modulo sample ring size */
4234+
if (++ps == psb->ps_fence)
4235+
ps = psb->ps_samples;
4236+
} while (ps != ps_end);
41854237

41864238
KASSERT(ncallchains > 0,
41874239
("[pmc,%d] cpu %d didn't find a sample to collect", __LINE__,
@@ -4191,6 +4243,9 @@ pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf)
41914243
("[pmc,%d] invalid td_pinned value", __LINE__));
41924244
sched_unpin(); /* Can migrate safely now. */
41934245

4246+
/* mark CPU as needing processing */
4247+
CPU_SET_ATOMIC(cpu, &pmc_cpumask);
4248+
41944249
return;
41954250
}
41964251

@@ -4304,10 +4359,11 @@ pmc_process_samples(int cpu, int ring)
43044359
if (pm == NULL || /* !cfg'ed */
43054360
pm->pm_state != PMC_STATE_RUNNING || /* !active */
43064361
!PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) || /* !sampling */
4307-
pm->pm_stalled == 0) /* !stalled */
4362+
!CPU_ISSET(cpu, &pm->pm_cpustate) || /* !desired */
4363+
!CPU_ISSET(cpu, &pm->pm_stalled)) /* !stalled */
43084364
continue;
43094365

4310-
pm->pm_stalled = 0;
4366+
CPU_CLR_ATOMIC(cpu, &pm->pm_stalled);
43114367
(*pcd->pcd_start_pmc)(cpu, adjri);
43124368
}
43134369
}
@@ -4426,23 +4482,31 @@ pmc_process_exit(void *arg __unused, struct proc *p)
44264482
("[pmc,%d] pm %p != pp_pmcs[%d] %p",
44274483
__LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc));
44284484

4429-
(void) pcd->pcd_stop_pmc(cpu, adjri);
4430-
44314485
KASSERT(pm->pm_runcount > 0,
44324486
("[pmc,%d] bad runcount ri %d rc %d",
44334487
__LINE__, ri, pm->pm_runcount));
44344488

4435-
/* Stop hardware only if it is actually running */
4436-
if (pm->pm_state == PMC_STATE_RUNNING &&
4437-
pm->pm_stalled == 0) {
4438-
pcd->pcd_read_pmc(cpu, adjri, &newvalue);
4439-
tmp = newvalue -
4440-
PMC_PCPU_SAVED(cpu,ri);
4441-
4442-
mtx_pool_lock_spin(pmc_mtxpool, pm);
4443-
pm->pm_gv.pm_savedvalue += tmp;
4444-
pp->pp_pmcs[ri].pp_pmcval += tmp;
4445-
mtx_pool_unlock_spin(pmc_mtxpool, pm);
4489+
/*
4490+
* Change desired state, and then stop if not
4491+
* stalled. This two-step dance should avoid
4492+
* race conditions where an interrupt re-enables
4493+
* the PMC after this code has already checked
4494+
* the pm_stalled flag.
4495+
*/
4496+
if (CPU_ISSET(cpu, &pm->pm_cpustate)) {
4497+
CPU_CLR_ATOMIC(cpu, &pm->pm_cpustate);
4498+
if (!CPU_ISSET(cpu, &pm->pm_stalled)) {
4499+
(void) pcd->pcd_stop_pmc(cpu, adjri);
4500+
pcd->pcd_read_pmc(cpu, adjri,
4501+
&newvalue);
4502+
tmp = newvalue -
4503+
PMC_PCPU_SAVED(cpu,ri);
4504+
4505+
mtx_pool_lock_spin(pmc_mtxpool, pm);
4506+
pm->pm_gv.pm_savedvalue += tmp;
4507+
pp->pp_pmcs[ri].pp_pmcval += tmp;
4508+
mtx_pool_unlock_spin(pmc_mtxpool, pm);
4509+
}
44464510
}
44474511

44484512
atomic_subtract_rel_int(&pm->pm_runcount,1);

sys/sys/pmc.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -534,14 +534,15 @@ struct pmc_op_configurelog {
534534
*/
535535

536536
struct pmc_op_getdriverstats {
537-
int pm_intr_ignored; /* #interrupts ignored */
538-
int pm_intr_processed; /* #interrupts processed */
539-
int pm_intr_bufferfull; /* #interrupts with ENOSPC */
540-
int pm_syscalls; /* #syscalls */
541-
int pm_syscall_errors; /* #syscalls with errors */
542-
int pm_buffer_requests; /* #buffer requests */
543-
int pm_buffer_requests_failed; /* #failed buffer requests */
544-
int pm_log_sweeps; /* #sample buffer processing passes */
537+
unsigned int pm_intr_ignored; /* #interrupts ignored */
538+
unsigned int pm_intr_processed; /* #interrupts processed */
539+
unsigned int pm_intr_bufferfull; /* #interrupts with ENOSPC */
540+
unsigned int pm_syscalls; /* #syscalls */
541+
unsigned int pm_syscall_errors; /* #syscalls with errors */
542+
unsigned int pm_buffer_requests; /* #buffer requests */
543+
unsigned int pm_buffer_requests_failed; /* #failed buffer requests */
544+
unsigned int pm_log_sweeps; /* #sample buffer processing
545+
passes */
545546
};
546547

547548
/*
@@ -598,6 +599,7 @@ struct pmc_op_getdyneventinfo {
598599

599600
#include <sys/malloc.h>
600601
#include <sys/sysctl.h>
602+
#include <sys/_cpuset.h>
601603

602604
#include <machine/frame.h>
603605

@@ -713,7 +715,8 @@ struct pmc {
713715
pmc_value_t pm_initial; /* counting PMC modes */
714716
} pm_sc;
715717

716-
uint32_t pm_stalled; /* marks stalled sampling PMCs */
718+
volatile cpuset_t pm_stalled; /* marks stalled sampling PMCs */
719+
volatile cpuset_t pm_cpustate; /* CPUs where PMC should be active */
717720
uint32_t pm_caps; /* PMC capabilities */
718721
enum pmc_event pm_event; /* event being measured */
719722
uint32_t pm_flags; /* additional flags PMC_F_... */

0 commit comments

Comments
 (0)