401 lines
12 KiB
C++
401 lines
12 KiB
C++
|
/*
|
||
|
* Copyright (C) 2015 Christopher Gilbert.
|
||
|
*
|
||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||
|
* of this software and associated documentation files (the "Software"), to deal
|
||
|
* in the Software without restriction, including without limitation the rights
|
||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||
|
* copies of the Software, and to permit persons to whom the Software is
|
||
|
* furnished to do so, subject to the following conditions:
|
||
|
*
|
||
|
* The above copyright notice and this permission notice shall be included in all
|
||
|
* copies or substantial portions of the Software.
|
||
|
*
|
||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||
|
* SOFTWARE.
|
||
|
*/
|
||
|
#ifndef BENCHPRESS_HPP
|
||
|
#define BENCHPRESS_HPP
|
||
|
|
||
|
#include <algorithm>  // max, min
#include <atomic>     // atomic_intmax_t
#include <chrono>     // high_resolution_clock, duration
#include <cstddef>    // size_t
#include <cstdint>    // int64_t
#include <functional> // function
#include <iomanip>    // setw
#include <iostream>   // cout
#include <regex>      // regex, regex_match
#include <sstream>    // stringstream
#include <string>     // string
#include <thread>     // thread
#include <utility>    // move
#include <vector>     // vector
|
||
|
|
||
|
namespace benchpress {
|
||
|
|
||
|
/*
|
||
|
* The options class encapsulates all options for running benchmarks.
|
||
|
*
|
||
|
* When including benchpress, a main function can be emitted which includes a command-line parser for building an
* options object. Occasionally, however, a developer may need to write their own main stub and construct the
* options object manually:
|
||
|
*
|
||
|
* options opts;
|
||
|
* opts
|
||
|
* .bench(".*")
|
||
|
* .benchtime(1)
|
||
|
* .cpu(4);
|
||
|
*/
|
||
|
class options {
    std::string d_bench;     // regular expression selecting which benchmarks to run
    size_t      d_benchtime; // target running time of each benchmark, in seconds
    size_t      d_cpu;       // number of threads used by parallel benchmarks

public:
    // Defaults: run every benchmark (".*") for one second each, using one thread per hardware thread.
    options()
        : d_bench(".*")
        , d_benchtime(1)
        // hardware_concurrency() is allowed to return 0 when the value cannot be determined; fall back to a
        // single thread so that parallel benchmarks still execute by default.
        , d_cpu(std::max<size_t>(1, std::thread::hardware_concurrency()))
    {}

    // Sets the regular expression used to select benchmarks by name. Returns *this to allow chaining.
    options& bench(const std::string& bench) {
        d_bench = bench;
        return *this;
    }

    // Sets the target running time of each benchmark, in seconds. Returns *this to allow chaining.
    options& benchtime(size_t benchtime) {
        d_benchtime = benchtime;
        return *this;
    }

    // Sets the number of threads used by parallel benchmarks. The value is stored as given.
    // Returns *this to allow chaining.
    options& cpu(size_t cpu) {
        d_cpu = cpu;
        return *this;
    }

    // Returns the benchmark selection regular expression.
    std::string get_bench() const {
        return d_bench;
    }

    // Returns the target running time of each benchmark, in seconds.
    size_t get_benchtime() const {
        return d_benchtime;
    }

    // Returns the number of threads used by parallel benchmarks.
    size_t get_cpu() const {
        return d_cpu;
    }
};
|
||
|
|
||
|
class context;

/*
 * The benchmark_info class stores a benchmark's name together with the callable that implements it.
 *
 *     benchmark_info bi("example", [](benchpress::context* b) {
 *         // benchmark function
 *     });
 */
class benchmark_info {
    std::string                   d_name; // name the benchmark is registered and reported under
    std::function<void(context*)> d_func; // the benchmark body; invoked with the running context

public:
    // Both arguments are taken by value and moved into place, so callers passing temporaries pay for no copy
    // and callers passing lvalues pay for exactly one.
    benchmark_info(std::string name, std::function<void(context*)> func)
        : d_name(std::move(name))
        , d_func(std::move(func))
    {}

    // Returns a copy of the benchmark name.
    std::string get_name() const { return d_name; }

    // Returns a copy of the benchmark callable.
    std::function<void(context*)> get_func() const { return d_func; }
};
|
||
|
|
||
|
/*
|
||
|
* The registration class is responsible for providing a single global point of reference for registering
|
||
|
* benchmark functions.
|
||
|
*
|
||
|
* registration::get_ptr()->register_benchmark(info);
|
||
|
*/
|
||
|
class registration {
|
||
|
static registration* d_this;
|
||
|
std::vector<benchmark_info> d_benchmarks;
|
||
|
|
||
|
public:
|
||
|
static registration* get_ptr() {
|
||
|
if (nullptr == d_this) {
|
||
|
d_this = new registration();
|
||
|
}
|
||
|
return d_this;
|
||
|
}
|
||
|
|
||
|
void register_benchmark(benchmark_info& info) {
|
||
|
d_benchmarks.push_back(info);
|
||
|
}
|
||
|
|
||
|
std::vector<benchmark_info> get_benchmarks() { return d_benchmarks; }
|
||
|
};
|
||
|
|
||
|
/*
|
||
|
* The auto_register class is a helper used to register benchmarks.
|
||
|
*/
|
||
|
class auto_register {
|
||
|
public:
|
||
|
auto_register(const std::string& name, std::function<void(context*)> func) {
|
||
|
benchmark_info info(name, func);
|
||
|
registration::get_ptr()->register_benchmark(info);
|
||
|
}
|
||
|
};
|
||
|
|
||
|
// Token-pasting helpers. CONCAT2 goes through one extra level of macro expansion so that arguments such as
// __LINE__ are expanded before they are pasted together.
#define CONCAT(lhs, rhs) lhs ## rhs
#define CONCAT2(lhs, rhs) CONCAT(lhs, rhs)

// The BENCHMARK macro defines a benchpress::auto_register object named register_<line number>, constructed from
// the given name and function, which registers the benchmark with the registration class. The expansion already
// ends with a semicolon.
#define BENCHMARK(name, func) benchpress::auto_register CONCAT2(register_, __LINE__)((name), (func));

// This macro expands to a block containing an empty asm statement. It is intended to stop the compiler from
// removing a redundant code path which has no side-effects.
#define DISABLE_REDUNDANT_CODE_OPT() { asm(""); }
|
||
|
|
||
|
/*
|
||
|
* The result class is responsible for producing a printable string representation of a benchmark run.
|
||
|
*/
|
||
|
class result {
    size_t                   d_num_iterations; // number of iterations executed in the measured run
    std::chrono::nanoseconds d_duration;       // total time of the measured run
    size_t                   d_num_bytes;      // bytes processed per iteration (0 when not reported)

public:
    result(size_t num_iterations, std::chrono::nanoseconds duration, size_t num_bytes)
        : d_num_iterations(num_iterations)
        , d_duration(duration)
        , d_num_bytes(num_bytes)
    {}

    // Returns the average time per iteration in whole nanoseconds, or 0 when no iterations were run.
    size_t get_ns_per_op() const {
        if (d_num_iterations == 0) {
            return 0;
        }
        return static_cast<size_t>(d_duration.count() / d_num_iterations);
    }

    // Returns the throughput in megabytes (10^6 bytes) per second, or 0 when the iteration count, the
    // duration or the byte count is zero (or the duration is negative).
    double get_mb_per_s() const {
        if (d_num_iterations == 0 || d_duration.count() <= 0 || d_num_bytes == 0) {
            return 0;
        }
        // Convert to fractional seconds. Casting to std::chrono::seconds would truncate towards zero, which
        // yields a zero divisor for any run shorter than one second and an inflated rate for longer runs.
        const double seconds    = std::chrono::duration<double>(d_duration).count();
        const double megabytes  = double(d_num_bytes) * double(d_num_iterations) / 1e6;
        return megabytes / seconds;
    }

    // Formats the result as right-aligned, 12-character-wide columns: the iteration count, the time per
    // iteration followed by " ns/op", and, only when a throughput is available, the rate followed by " MB/s".
    std::string to_string() const {
        std::stringstream out;
        out << std::setw(12) << std::right << d_num_iterations;
        out << std::setw(12) << std::right << get_ns_per_op() << " ns/op";
        const double mbs = get_mb_per_s();
        if (mbs > 0.0) {
            out << std::setw(12) << std::right << mbs << " MB/s";
        }
        return out.str();
    }
};
|
||
|
|
||
|
/*
|
||
|
* The parallel_context class is responsible for providing a thread-safe context for parallel benchmark code.
|
||
|
*/
|
||
|
class parallel_context {
    // Iterations still to be handed out. The counter is atomic so that several threads can share one
    // parallel_context; it keeps decreasing (below zero) once the budget is used up.
    std::atomic_intmax_t d_num_iterations;

public:
    parallel_context(size_t num_iterations)
        : d_num_iterations(num_iterations)
    {}

    // Atomically takes one iteration from the counter. Returns true while the value before the decrement
    // was positive, i.e. exactly num_iterations times in total across all callers.
    bool next() {
        const auto remaining_before = d_num_iterations.fetch_sub(1);
        return remaining_before > 0;
    }
};
|
||
|
|
||
|
/*
|
||
|
* The context class is responsible for providing an interface for capturing benchmark metrics to benchmark functions.
|
||
|
*/
|
||
|
class context {
|
||
|
bool d_timer_on;
|
||
|
std::chrono::high_resolution_clock::time_point d_start;
|
||
|
std::chrono::nanoseconds d_duration;
|
||
|
std::chrono::seconds d_benchtime;
|
||
|
size_t d_num_iterations;
|
||
|
size_t d_num_threads;
|
||
|
size_t d_num_bytes;
|
||
|
benchmark_info d_benchmark;
|
||
|
|
||
|
public:
|
||
|
context(const benchmark_info& info, const options& opts)
|
||
|
: d_timer_on(false)
|
||
|
, d_start()
|
||
|
, d_duration()
|
||
|
, d_benchtime(std::chrono::seconds(opts.get_benchtime()))
|
||
|
, d_num_iterations(1)
|
||
|
, d_num_threads(opts.get_cpu())
|
||
|
, d_num_bytes(0)
|
||
|
, d_benchmark(info)
|
||
|
{}
|
||
|
|
||
|
size_t num_iterations() const { return d_num_iterations; }
|
||
|
|
||
|
void set_num_threads(size_t n) { d_num_threads = n; }
|
||
|
size_t num_threads() const { return d_num_threads; }
|
||
|
|
||
|
void start_timer() {
|
||
|
if (!d_timer_on) {
|
||
|
d_start = std::chrono::high_resolution_clock::now();
|
||
|
d_timer_on = true;
|
||
|
}
|
||
|
}
|
||
|
void stop_timer() {
|
||
|
if (d_timer_on) {
|
||
|
d_duration += std::chrono::high_resolution_clock::now() - d_start;
|
||
|
d_timer_on = false;
|
||
|
}
|
||
|
}
|
||
|
void reset_timer() {
|
||
|
if (d_timer_on) {
|
||
|
d_start = std::chrono::high_resolution_clock::now();
|
||
|
}
|
||
|
d_duration = std::chrono::nanoseconds::zero();
|
||
|
}
|
||
|
|
||
|
void set_bytes(int64_t bytes) { d_num_bytes = bytes; }
|
||
|
|
||
|
size_t get_ns_per_op() {
|
||
|
if (d_num_iterations <= 0) {
|
||
|
return 0;
|
||
|
}
|
||
|
return d_duration.count() / d_num_iterations;
|
||
|
}
|
||
|
|
||
|
void run_n(size_t n) {
|
||
|
d_num_iterations = n;
|
||
|
reset_timer();
|
||
|
start_timer();
|
||
|
d_benchmark.get_func()(this);
|
||
|
stop_timer();
|
||
|
}
|
||
|
|
||
|
void run_parallel(std::function<void(parallel_context*)> f) {
|
||
|
parallel_context pc(d_num_iterations);
|
||
|
std::vector<std::thread> threads;
|
||
|
for (size_t i = 0; i < d_num_threads; ++i) {
|
||
|
threads.push_back(std::thread([&pc,&f]() -> void {
|
||
|
f(&pc);
|
||
|
}));
|
||
|
}
|
||
|
for(auto& thread : threads){
|
||
|
thread.join();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
result run() {
|
||
|
size_t n = 1;
|
||
|
run_n(n);
|
||
|
while (d_duration < d_benchtime && n < 1e9) {
|
||
|
size_t last = n;
|
||
|
if (get_ns_per_op() == 0) {
|
||
|
n = 1e9;
|
||
|
} else {
|
||
|
n = d_duration.count() / get_ns_per_op();
|
||
|
}
|
||
|
n = std::max(std::min(n+n/2, 100*last), last+1);
|
||
|
n = round_up(n);
|
||
|
run_n(n);
|
||
|
}
|
||
|
return result(n, d_duration, d_num_bytes);
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
template<typename T>
|
||
|
T round_down_10(T n) {
|
||
|
int tens = 0;
|
||
|
while (n > 10) {
|
||
|
n /= 10;
|
||
|
tens++;
|
||
|
}
|
||
|
int result = 1;
|
||
|
for (int i = 0; i < tens; ++i) {
|
||
|
result *= 10;
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
template<typename T>
|
||
|
T round_up(T n) {
|
||
|
T base = round_down_10(n);
|
||
|
if (n < (2 * base)) {
|
||
|
return 2 * base;
|
||
|
}
|
||
|
if (n < (5 * base)) {
|
||
|
return 5 * base;
|
||
|
}
|
||
|
return 10 * base;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/*
|
||
|
* The run_benchmarks function will run the registered benchmarks.
|
||
|
*/
|
||
|
void run_benchmarks(const options& opts) {
|
||
|
std::regex match_r(opts.get_bench());
|
||
|
auto benchmarks = registration::get_ptr()->get_benchmarks();
|
||
|
for (auto& info : benchmarks) {
|
||
|
if (std::regex_match(info.get_name(), match_r)) {
|
||
|
context c(info, opts);
|
||
|
auto r = c.run();
|
||
|
std::cout << std::setw(35) << std::left << info.get_name() << r.to_string() << std::endl;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
} // namespace benchpress
|
||
|
|
||
|
/*
|
||
|
* If BENCHPRESS_CONFIG_MAIN is defined when the file is included then a main function will be emitted which provides a
|
||
|
* command-line parser and then executes run_benchmarks.
|
||
|
*/
|
||
|
#ifdef BENCHPRESS_CONFIG_MAIN
|
||
|
#include "cxxopts.hpp"
|
||
|
benchpress::registration* benchpress::registration::d_this;
|
||
|
int main(int argc, char** argv) {
|
||
|
std::chrono::high_resolution_clock::time_point bp_start = std::chrono::high_resolution_clock::now();
|
||
|
benchpress::options bench_opts;
|
||
|
try {
|
||
|
cxxopts::Options cmd_opts(argv[0], " - command line options");
|
||
|
cmd_opts.add_options()
|
||
|
("bench", "run benchmarks matching the regular expression", cxxopts::value<std::string>()
|
||
|
->default_value(".*"))
|
||
|
("benchtime", "run enough iterations of each benchmark to take t seconds", cxxopts::value<size_t>()
|
||
|
->default_value("1"))
|
||
|
("cpu", "specify the number of threads to use for parallel benchmarks", cxxopts::value<size_t>()
|
||
|
->default_value(std::to_string(std::thread::hardware_concurrency())))
|
||
|
("help", "print help")
|
||
|
;
|
||
|
cmd_opts.parse(argc, argv);
|
||
|
if (cmd_opts.count("help")) {
|
||
|
std::cout << cmd_opts.help({""}) << std::endl;
|
||
|
exit(0);
|
||
|
}
|
||
|
if (cmd_opts.count("bench")) {
|
||
|
bench_opts.bench(cmd_opts["bench"].as<std::string>());
|
||
|
}
|
||
|
if (cmd_opts.count("benchtime")) {
|
||
|
bench_opts.benchtime(cmd_opts["benchtime"].as<size_t>());
|
||
|
}
|
||
|
if (cmd_opts.count("cpu")) {
|
||
|
bench_opts.cpu(cmd_opts["cpu"].as<size_t>());
|
||
|
}
|
||
|
} catch (const cxxopts::OptionException& e) {
|
||
|
std::cout << "error parsing options: " << e.what() << std::endl;
|
||
|
exit(1);
|
||
|
}
|
||
|
benchpress::run_benchmarks(bench_opts);
|
||
|
float duration = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||
|
std::chrono::high_resolution_clock::now() - bp_start
|
||
|
).count() / 1000.f;
|
||
|
std::cout << argv[0] << " " << duration << "s" << std::endl;
|
||
|
return 0;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#endif // BENCHPRESS_HPP
|