ACK: [PATCH 08/11] fwts: Use linux perf counters for cpu benchmarking
Colin Ian King
colin.king at canonical.com
Tue May 26 09:40:48 UTC 2015
On 21/05/15 10:34, Jeremy Kerr wrote:
> Currently, we use a loop counter to measure cpu performance. There are a
> couple of drawbacks of this:
>
> 1) the numbers are fairly arbitrary, and don't reflect any external
> values; and
>
> 2) the results are subject to perturbation from other tasks.
>
> This change uses the linux perf counters to measure actual CPU cycles,
> where available. This means we get numbers that are meaningful, and
> we'll get the performance value even if our process has been scheduled
> off the CPU.
>
> Now, the new results should match the actual cpu frequencies:
>
> CPU 0: 16 CPU frequency steps supported.
> Frequency | Relative Speed | Cycles | Bogo loops
> -----------+----------------+------------+-----------
> 1.200 GHz | 36.5 % | 1197262295 | 114643
> 1.300 GHz | 39.5 % | 1297048295 | 124214
> 1.400 GHz | 42.6 % | 1396813066 | 133889
> 1.500 GHz | 45.6 % | 1496592732 | 143456
> 1.600 GHz | 48.6 % | 1596393751 | 150124
> 1.700 GHz | 51.7 % | 1696137977 | 162350
> 1.800 GHz | 54.7 % | 1795918438 | 172071
> 1.900 GHz | 57.8 % | 1895685372 | 181249
> 2.000 GHz | 60.8 % | 1995453204 | 191176
> 2.100 GHz | 63.8 % | 2095225345 | 200753
> 2.200 GHz | 66.9 % | 2194993645 | 210282
> 2.300 GHz | 69.9 % | 2294780535 | 219945
> 2.400 GHz | 73.0 % | 2394544770 | 229664
> 2.500 GHz | 76.0 % | 2494315101 | 239121
> 2.600 GHz | 79.0 % | 2594055116 | 248536
> 2.601 GHz | 100.0 % | 3281891417 | 314546
>
> CPU 0 performance scaling OK
>
> Because we still want benchmarks to work when perf counters are not
> available, we still measure the loop count. This means we need to
> represent test results as a struct, with cycles and loops members.
>
> When tests need a scalar performance value,
> fwts_cpu_benchmark_best_result will return the most accurate result
> measured.
>
> Signed-off-by: Jeremy Kerr <jk at ozlabs.org>
>
> ---
> src/acpi/cstates/cstates.c | 4 -
> src/cpu/cpufreq/cpufreq.c | 63 +++++++++++++--------
> src/lib/include/fwts_cpu.h | 11 +++
> src/lib/src/fwts_cpu.c | 107 ++++++++++++++++++++++++++++++++++---
> 4 files changed, 153 insertions(+), 32 deletions(-)
>
> diff --git a/src/acpi/cstates/cstates.c b/src/acpi/cstates/cstates.c
> index b58f15c..42c634c 100644
> --- a/src/acpi/cstates/cstates.c
> +++ b/src/acpi/cstates/cstates.c
> @@ -133,9 +133,9 @@ static void do_cpu(fwts_framework *fw, int nth, int cpus, int cpu, char *path)
> if ((i & 7) < 4)
> sleep(1);
> else {
> - uint64_t loop_count;
> + fwts_cpu_benchmark_result result;
>
> - if (fwts_cpu_performance(fw, cpu, &loop_count) != FWTS_OK) {
> + if (fwts_cpu_benchmark(fw, cpu, &result) != FWTS_OK) {
> fwts_failed(fw, LOG_LEVEL_HIGH, "CPUFailedPerformance",
> "Could not determine the CPU performance, this "
> "may be due to not being able to get or set the "
> diff --git a/src/cpu/cpufreq/cpufreq.c b/src/cpu/cpufreq/cpufreq.c
> index 979a9e1..e409375 100644
> --- a/src/cpu/cpufreq/cpufreq.c
> +++ b/src/cpu/cpufreq/cpufreq.c
> @@ -45,7 +45,7 @@
>
> typedef struct {
> uint64_t Hz;
> - uint64_t speed;
> + fwts_cpu_benchmark_result perf;
> } fwts_cpu_freq;
>
> struct cpu {
> @@ -149,6 +149,7 @@ static int get_performance_repeat(
> const int type,
> uint64_t *retval)
> {
> + fwts_cpu_benchmark_result result;
> int i;
>
> uint64_t max = 0;
> @@ -159,9 +160,10 @@ static int get_performance_repeat(
> for (i = 0; i < count; i++) {
> uint64_t temp;
>
> - if (fwts_cpu_performance(fw, cpu->idx, &temp) != FWTS_OK)
> + if (fwts_cpu_benchmark(fw, cpu->idx, &result) != FWTS_OK)
> return FWTS_ERROR;
>
> + temp = fwts_cpu_benchmark_best_result(&result);
> if (temp) {
> if (temp < min)
> min = temp;
> @@ -276,17 +278,19 @@ static int test_one_cpu_performance(fwts_framework *fw, struct cpu *cpu,
> int i;
>
> for (i = 0; i < cpu->n_freqs; i++) {
> + uint64_t perf;
> +
> cpu_set_frequency(fw, cpu, cpu->freqs[i].Hz);
>
> - if (fwts_cpu_performance(fw, cpu->idx, &cpu->freqs[i].speed)
> + if (fwts_cpu_benchmark(fw, cpu->idx, &cpu->freqs[i].perf)
> != FWTS_OK) {
> fwts_log_error(fw, "Failed to get CPU performance for "
> "CPU frequency %" PRId64 " Hz.",
> cpu->freqs[i].Hz);
> - cpu->freqs[i].speed = 0;
> }
> - if (cpu->freqs[i].speed > cpu_top_perf)
> - cpu_top_perf = cpu->freqs[i].speed;
> + perf = fwts_cpu_benchmark_best_result(&cpu->freqs[i].perf);
> + if (perf > cpu_top_perf)
> + cpu_top_perf = perf;
>
> fwts_progress(fw, (100 * ((cpu_idx * cpu->n_freqs) + i)) /
> (n_online_cpus * cpu->n_freqs));
> @@ -294,37 +298,46 @@ static int test_one_cpu_performance(fwts_framework *fw, struct cpu *cpu,
>
> fwts_log_info(fw, "CPU %d: %i CPU frequency steps supported.",
> cpu->idx, cpu->n_freqs);
> - fwts_log_info_verbatum(fw, " Frequency | Relative Speed | Bogo loops");
> - fwts_log_info_verbatum(fw, "-----------+----------------+-----------");
> + fwts_log_info_verbatum(fw,
> + " Frequency | Relative Speed | Cycles | Bogo loops");
> + fwts_log_info_verbatum(fw,
> + "-----------+----------------+------------+-----------");
> for (i = 0; i < cpu->n_freqs; i++) {
> char *turbo = "";
> #ifdef FWTS_ARCH_INTEL
> if ((i == 0) && (cpu->n_freqs > 1) &&
> - (hz_almost_equal(cpu->freqs[i].Hz, cpu->freqs[i + 1].Hz)))
> + (hz_almost_equal(cpu->freqs[i].Hz, cpu->freqs[i + 1].Hz)))
> turbo = " (Turbo Boost)";
> #endif
> - fwts_log_info_verbatum(fw, "%10s | %5.1f %% | %9" PRIu64
> - "%s",
> + uint64_t perf = fwts_cpu_benchmark_best_result(
> + &cpu->freqs[i].perf);
> + fwts_log_info_verbatum(fw,
> + "%10s | %5.1f %% "
> + "| %10" PRIu64 " | %9" PRIu64 "%s",
> hz_to_human(cpu->freqs[i].Hz),
> - 100.0 * cpu->freqs[i].speed / cpu_top_perf,
> - cpu->freqs[i].speed, turbo);
> + 100.0 * perf / cpu_top_perf,
> + cpu->freqs[i].perf.cycles,
> + cpu->freqs[i].perf.loops,
> + turbo);
> }
>
> fwts_log_nl(fw);
>
> /* now check for increasing performance */
> for (i = 0; i < cpu->n_freqs - 1; i++) {
> - if (cpu->freqs[i].speed <= cpu->freqs[i+1].speed)
> + uint64_t perf, last_perf;
> +
> + last_perf = fwts_cpu_benchmark_best_result(&cpu->freqs[i].perf);
> + perf = fwts_cpu_benchmark_best_result(&cpu->freqs[i+1].perf);
> + if (last_perf <= perf)
> continue;
>
> fwts_log_warning(fw,
> "Supposedly higher frequency %s is slower (%" PRIu64
> - " bogo loops) than frequency %s (%" PRIu64
> - " bogo loops) on CPU %i.",
> - hz_to_human(cpu->freqs[i+1].Hz),
> - cpu->freqs[i+1].speed,
> - hz_to_human(cpu->freqs[i].Hz),
> - cpu->freqs[i].speed,
> + ") than frequency %s (%" PRIu64
> + ") on CPU %i.",
> + hz_to_human(cpu->freqs[i+1].Hz), perf,
> + hz_to_human(cpu->freqs[i].Hz), last_perf,
> cpu->idx);
> return FWTS_ERROR;
> }
> @@ -459,6 +472,7 @@ static int cpufreq_test_sw_any(fwts_framework *fw)
> {
> uint64_t low_perf, high_perf, newhigh_perf;
> int i, j, rc, n_tests, performed_tests;
> + fwts_cpu_benchmark_result result;
> bool ok;
>
> rc = sw_tests_possible(fw);
> @@ -478,12 +492,13 @@ static int cpufreq_test_sw_any(fwts_framework *fw)
> cpu_set_lowest_frequency(fw, &cpus[i]);
>
> /* assume that all processors have the same low performance */
> - if (fwts_cpu_performance(fw, cpus[0].idx, &low_perf) != FWTS_OK) {
> + if (fwts_cpu_benchmark(fw, cpus[0].idx, &result) != FWTS_OK) {
> fwts_failed(fw, LOG_LEVEL_MEDIUM,
> "CPUFreqCPsSetToSW_ANYGetPerf",
> "Cannot get CPU performance.");
> return FWTS_ERROR;
> }
> + low_perf = fwts_cpu_benchmark_best_result(&result);
>
> ok = true;
>
> @@ -497,12 +512,13 @@ static int cpufreq_test_sw_any(fwts_framework *fw)
> if (!cpu->online)
> continue;
>
> - if (fwts_cpu_performance(fw, cpu->idx, &high_perf) != FWTS_OK) {
> + if (fwts_cpu_benchmark(fw, cpu->idx, &result) != FWTS_OK) {
> fwts_failed(fw, LOG_LEVEL_MEDIUM,
> "CPUFreqCPsSetToSW_ANYGetPerf",
> "Cannot get CPU performance.");
> return FWTS_ERROR;
> }
> + high_perf = fwts_cpu_benchmark_best_result(&result);
>
> performed_tests++;
> fwts_progress(fw, 100 * performed_tests/n_tests);
> @@ -514,13 +530,14 @@ static int cpufreq_test_sw_any(fwts_framework *fw)
> for (j = 0; j < num_cpus; j++)
> if (i != j)
> cpu_set_lowest_frequency(fw, &cpus[j]);
> - if (fwts_cpu_performance(fw, cpu->idx, &newhigh_perf)
> + if (fwts_cpu_benchmark(fw, cpu->idx, &result)
> != FWTS_OK) {
> fwts_failed(fw, LOG_LEVEL_MEDIUM,
> "CPUFreqCPsSetToSW_ANYGetPerf",
> "Cannot get CPU performance.");
> return FWTS_ERROR;
> }
> + newhigh_perf = fwts_cpu_benchmark_best_result(&result);
> if ((high_perf > newhigh_perf) &&
> (high_perf - newhigh_perf > (high_perf - low_perf)/4) &&
> (high_perf - low_perf > 20)) {
> diff --git a/src/lib/include/fwts_cpu.h b/src/lib/include/fwts_cpu.h
> index b132697..7162316 100644
> --- a/src/lib/include/fwts_cpu.h
> +++ b/src/lib/include/fwts_cpu.h
> @@ -33,6 +33,12 @@ typedef struct cpuinfo_x86 {
> char *flags; /* String containing flags */
> } fwts_cpuinfo_x86;
>
> +typedef struct cpu_benchmark_result {
> + bool cycles_valid;
> + uint64_t loops;
> + uint64_t cycles;
> +} fwts_cpu_benchmark_result;
> +
> int fwts_cpu_readmsr(const int cpu, const uint32_t reg, uint64_t *val);
>
> int fwts_cpu_is_Intel(bool *is_intel);
> @@ -46,6 +52,9 @@ int fwts_cpu_enumerate(void);
> int fwts_cpu_consume(const int seconds);
> int fwts_cpu_consume_start(void);
> void fwts_cpu_consume_complete(void);
> -int fwts_cpu_performance(fwts_framework *fw, const int cpu, uint64_t *loop_count);
> +int fwts_cpu_benchmark(fwts_framework *fw, const int cpu,
> + fwts_cpu_benchmark_result *result);
> +
> +uint64_t fwts_cpu_benchmark_best_result(fwts_cpu_benchmark_result *res);
>
> #endif
> diff --git a/src/lib/src/fwts_cpu.c b/src/lib/src/fwts_cpu.c
> index 75b1100..a7cfd3d 100644
> --- a/src/lib/src/fwts_cpu.c
> +++ b/src/lib/src/fwts_cpu.c
> @@ -26,8 +26,10 @@
> #include <limits.h>
> #include <string.h>
> #include <dirent.h>
> +#include <sys/ioctl.h>
> #include <sys/stat.h>
> #include <sys/types.h>
> +#include <sys/syscall.h>
> #include <sys/wait.h>
> #include <signal.h>
> #include <fcntl.h>
> @@ -35,6 +37,8 @@
> #include <sched.h>
> #include <time.h>
>
> +#include <linux/perf_event.h>
> +
> #include "fwts_types.h"
> #include "fwts_cpu.h"
> #include "fwts_pipeio.h"
> @@ -312,20 +316,73 @@ static void fwts_cpu_burn_cycles(void)
> }
> }
>
> +static int perf_setup_counter(int cpu)
> +{
> + struct perf_event_attr attr;
> + int fd;
> +
> + memset(&attr, 0, sizeof(attr));
> + attr.type = PERF_TYPE_HARDWARE;
> + attr.config = PERF_COUNT_HW_CPU_CYCLES;
> + attr.disabled = 1;
> + attr.size = sizeof(attr);
> +
> + fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
> + return fd;
> +}
> +
> +static int perf_start_counter(int fd)
> +{
> + int rc;
> +
> + rc = ioctl(fd, PERF_EVENT_IOC_ENABLE);
> + return rc == 0 ? FWTS_OK : FWTS_ERROR;
> +}
> +
> +static int perf_stop_counter(int fd)
> +{
> + int rc;
> +
> + rc = ioctl(fd, PERF_EVENT_IOC_DISABLE);
> + return rc == 0 ? FWTS_OK : FWTS_ERROR;
> +}
> +
> +static int perf_read_counter(int fd, unsigned long long *result)
> +{
> + unsigned long long buf;
> + int rc;
> +
> + rc = read(fd, &buf, sizeof(buf));
> + if (rc == sizeof(buf)) {
> + *result = buf;
> + rc = FWTS_OK;
> + } else {
> + rc = FWTS_ERROR;
> + }
> +
> + close(fd);
> + return rc;
> +}
> +
> /*
> - * fwts_cpu_performance()
> + * fwts_cpu_benchmark()
> *
> */
> -int fwts_cpu_performance(
> +int fwts_cpu_benchmark(
> fwts_framework *fw,
> const int cpu, /* CPU we want to measure performance */
> - uint64_t *loop_count) /* Returned measure of bogo compute power */
> + fwts_cpu_benchmark_result *result)
> {
> + unsigned long long perfctr_result;
> + fwts_cpu_benchmark_result tmp;
> cpu_set_t mask, oldset;
> + int perfctr, ncpus, rc;
> + static bool warned;
> time_t current;
> - int ncpus = fwts_cpu_enumerate();
> + bool perf_ok;
>
> - *loop_count = 0;
> + ncpus = fwts_cpu_enumerate();
> + memset(&tmp, 0, sizeof(tmp));
>
> if (ncpus == FWTS_ERROR)
> return FWTS_ERROR;
> @@ -333,6 +390,20 @@ int fwts_cpu_performance(
> if (cpu < 0 || cpu > ncpus)
> return FWTS_ERROR;
>
> + /* setup perf counter */
> + perf_ok = true;
> + perfctr = perf_setup_counter(cpu);
> + if (perfctr < 0) {
> + if (!warned) {
> + fwts_log_warning(fw, "Can't use linux performance "
> + "counters (perf), falling back to "
> + "relative measurements");
> + warned = true;
> + }
> + perf_ok = false;
> + }
> +
> +
> /* Pin to the specified CPU */
>
> if (sched_getaffinity(0, sizeof(oldset), &oldset) < 0) {
> @@ -352,6 +423,9 @@ int fwts_cpu_performance(
> while (current == time(NULL))
> sched_yield();
>
> + if (perf_ok)
> + perf_start_counter(perfctr);
> +
> current = time(NULL);
>
> /*
> @@ -360,17 +434,38 @@ int fwts_cpu_performance(
> */
> do {
> fwts_cpu_burn_cycles();
> - (*loop_count)++;
> + tmp.loops++;
> } while (current == time(NULL));
>
> + if (perf_ok)
> + perf_stop_counter(perfctr);
> +
> if (sched_setaffinity(0, sizeof(oldset), &oldset) < 0) {
> fwts_log_error(fw, "Cannot restore old CPU affinity settings.");
> return FWTS_ERROR;
> }
>
> + if (perf_ok) {
> + rc = perf_read_counter(perfctr, &perfctr_result);
> + if (rc == FWTS_OK) {
> + tmp.cycles = perfctr_result;
> + tmp.cycles_valid = true;
> + } else {
> + fwts_log_warning(fw, "failed to read perf counters");
> + }
> +
> + }
> +
> + *result = tmp;
> +
> return FWTS_OK;
> }
>
> +uint64_t fwts_cpu_benchmark_best_result(fwts_cpu_benchmark_result *res)
> +{
> + return res->cycles_valid ? res->cycles : res->loops;
> +}
> +
> /*
> * fwts_cpu_consume_cycles()
> * eat up CPU cycles
>
Acked-by: Colin Ian King <colin.king at canonical.com>
More information about the fwts-devel
mailing list