-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathraw.c
105 lines (97 loc) · 3.37 KB
/
raw.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/*
* Copyright 2019 Oleksandr Kuvshynov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../include/b63/b63.h"
#include "../include/b63/counters/perf_events.h"
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
/*
* This example illustrates using raw events from the processor's PMU.
* In Linux perf_events that is achieved by passing a combination
* of 'mask' and 'event'. Those are CPU-specific, so this example
* might need to be run differently, or might not work at all, on other CPUs.
* Still, it provides a decent illustration of how to use them (and why).
*
* In b63, -c lpe:r<mask><event> needs to be passed as a command-line argument
* in order to read the counter value.
* For example, on old Core2 Intel processor from 2009:
*
* mask = 0x01 && event = 0xA1 corresponds to 'uops executed on port 0'
* mask = 0x02 && event = 0xA1 corresponds to 'uops executed on port 1'
* mask = 0x04 && event = 0xA1 corresponds to 'uops executed on port 2'
* mask = 0x08 && event = 0xA1 corresponds to 'uops executed on port 3'
* mask = 0x10 && event = 0xA1 corresponds to 'uops executed on port 4'
* mask = 0x20 && event = 0xA1 corresponds to 'uops executed on port 5'
*
* A memory load is executed on port 2 and a memory (data) store on port 4.
*
* Thus, running this benchmark produces result like this:
*
$ examples/_build/bm_raw -c lpe:r04A1 -i
Running 2 benchmarks
[DONE] many_writes : 32.571429 events per iteration
[DONE] many_reads : 285749.285714 events per iteration
$ examples/_build/bm_raw -c lpe:r10A1 -i
Running 2 benchmarks
[DONE] many_writes : 500017.428571 events per iteration
[DONE] many_reads : 10.000000 events per iteration
*
* For more information on 'which instructions run where' and 'which codes
correspond to which events' please refer to:
* - Intel Optimization manual:
https://software.intel.com/en-us/download/intel-64-and-ia-32-architectures-optimization-reference-manual
* - Agner Fog's manual, specifically 'microarchitecture' volume:
https://www.agner.org/optimize/#manuals
*
*/
/*
 * Number of int32_t elements each benchmark allocates and fills with rand()
 * values during its set-up phase.
 */
const int32_t kSize = 500000;
/*
 * Number of leading elements of the buffer that the inner benchmark loops
 * actually touch on every iteration (only the first kLookups of the kSize
 * elements are read or written).
 */
const int32_t kLookups = 50000;
/*
 * Load-heavy benchmark.
 *
 * Set-up (inside B63_SUSPEND, which is intended to keep it out of the
 * measured counts): allocate kSize elements and fill them with rand() values.
 * Measured part: for each of the n iterations, add the first kLookups
 * elements of the buffer into an accumulator.
 *
 * The buffer and the accumulator are unsigned on purpose: the sum of rand()
 * values exceeds 31 bits almost immediately, which is undefined behavior for
 * a signed accumulator but a well-defined wrap-around for an unsigned one.
 * The stored bit patterns and the loads issued are the same either way.
 */
B63_BENCHMARK(many_reads, n) {
  uint32_t *v = NULL;
  uint32_t res = 0;
  int32_t i = 0, j;
  B63_SUSPEND {
    v = malloc(kSize * sizeof *v);
    if (v == NULL) {
      /* Nothing meaningful can be measured without the buffer. */
      abort();
    }
    for (i = 0; i < kSize; i++) {
      v[i] = (uint32_t)rand();
    }
  }
  for (j = 0; j < n; j++) {
    for (i = 0; i < kLookups; i++) {
      res += v[i];
    }
  }
  /* Hand the sum to b63 so the loads above have an observable use. */
  B63_KEEP(res);
  {
    /*
     * Release the buffer so repeated invocations do not leak it. The extra
     * scope keeps this suspended region independent of the one above,
     * whatever B63_SUSPEND declares internally.
     */
    B63_SUSPEND {
      free(v);
    }
  }
}
/*
 * Store-heavy benchmark.
 *
 * Set-up (inside B63_SUSPEND, which is intended to keep it out of the
 * measured counts): allocate kSize elements and fill them with rand() values.
 * Measured part: for each of the n iterations, overwrite the first kLookups
 * elements with a running value that is then advanced by i + j.
 *
 * The buffer and the running value are unsigned on purpose: the running
 * value passes 2^31 during the second outer iteration, which is undefined
 * behavior for a signed integer but a well-defined wrap-around for an
 * unsigned one. The stores issued are the same either way.
 */
B63_BENCHMARK(many_writes, n) {
  uint32_t *v = NULL;
  uint32_t res = 0;
  int32_t i = 0, j;
  B63_SUSPEND {
    v = malloc(kSize * sizeof *v);
    if (v == NULL) {
      /* Nothing meaningful can be measured without the buffer. */
      abort();
    }
    for (i = 0; i < kSize; i++) {
      v[i] = (uint32_t)rand();
    }
  }
  for (j = 0; j < n; j++) {
    for (i = 0; i < kLookups; i++) {
      v[i] = res;
      res += (uint32_t)i + (uint32_t)j;
    }
  }
  /* Hand the running value to b63 so the loop has an observable use. */
  B63_KEEP(res);
  /*
   * NOTE(review): v is intentionally not freed here. The stored values are
   * never read back, so a free() following the loop may allow the optimizer
   * to treat the stores as dead and drop the very work this benchmark is
   * meant to count. Any clean-up added later needs a compiler barrier on the
   * buffer contents first.
   */
}
/*
 * Program entry point: seeds the C library random number generator from the
 * current time, then passes the command line to b63 through B63_RUN.
 */
int main(int argc, char **argv) {
  /* rand() is used to fill the benchmark buffers; vary its seed per run. */
  const unsigned int rng_seed = (unsigned int)time(NULL);

  srand(rng_seed);
  B63_RUN(argc, argv);
  return 0;
}