blob: d8a5b957223ee4b924edfe102f89a23bc5de9858 [file] [log] [blame]
// Shared bench marking code
#pragma once
#include <inttypes.h>
#include <stdio.h>
#if defined(__AVR__)
#include "avr_type_helpers.h"
#else
#include <algorithm>
#include <limits>
#include <type_traits>
#endif
#undef UNUSED
#define UNUSED(x) (void)(x)
#if defined(_WIN32) || defined(WIN32)
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
#include <windows.h>
#define LIBDIVIDE_WINDOWS
#endif
#include "libdivide.h"
#include "outputs.h"
#include "random_numerators.hpp"
#include "timer.hpp"
#include "type_mappings.h"
using namespace libdivide;
#if defined(__GNUC__)
#define NOINLINE __attribute__((__noinline__))
#elif defined(_MSC_VER)
#define NOINLINE __declspec(noinline)
#pragma warning(disable : 4146)
#else
#define NOINLINE
#endif
#if defined(LIBDIVIDE_AVX512)
#define x86_VECTOR_TYPE __m512i
#define SETZERO_SI _mm512_setzero_si512
#define LOAD_SI _mm512_load_si512
#define ADD_EPI64 _mm512_add_epi64
#define ADD_EPI32 _mm512_add_epi32
#define ADD_EPI16 _mm512_add_epi16
#elif defined(LIBDIVIDE_AVX2)
#define x86_VECTOR_TYPE __m256i
#define SETZERO_SI _mm256_setzero_si256
#define LOAD_SI _mm256_load_si256
#define ADD_EPI64 _mm256_add_epi64
#define ADD_EPI32 _mm256_add_epi32
#define ADD_EPI16 _mm256_add_epi16
#elif defined(LIBDIVIDE_SSE2)
#define x86_VECTOR_TYPE __m128i
#define SETZERO_SI _mm_setzero_si128
#define LOAD_SI _mm_load_si128
#define ADD_EPI64 _mm_add_epi64
#define ADD_EPI32 _mm_add_epi32
#define ADD_EPI16 _mm_add_epi16
#endif
// Helper - given a vector of some type, convert it to unsigned and sum it.
// This is factored out in this funny way to avoid signed integer overflow.
template <typename IntT>
inline uint64_t unsigned_sum_vals(const IntT *vals, size_t count) {
typedef typename std::make_unsigned<IntT>::type UIntT;
UIntT sum = 0;
for (size_t i = 0; i < count; i++) {
sum += static_cast<UIntT>(vals[i]);
}
return sum;
}
template <typename IntT, typename Divisor>
NOINLINE uint64_t sum_quotients(const random_numerators<IntT> &vals, const Divisor &div) {
// Need to use unsigned to avoid signed integer overflow.
typedef typename std::make_unsigned<IntT>::type UIntT;
UIntT sum = 0;
for (auto iter = vals.begin(); iter != vals.end(); ++iter) {
sum += (UIntT)(*iter / div);
}
return (uint64_t)sum;
}
#ifdef x86_VECTOR_TYPE
template <size_t IntSize>
inline x86_VECTOR_TYPE add_vector(x86_VECTOR_TYPE sumX4, x86_VECTOR_TYPE numers) {
UNUSED(numers);
abort();
return sumX4;
}
template <>
inline x86_VECTOR_TYPE add_vector<2U>(x86_VECTOR_TYPE sumX4, x86_VECTOR_TYPE numers) {
return ADD_EPI16(sumX4, numers);
}
template <>
inline x86_VECTOR_TYPE add_vector<4U>(x86_VECTOR_TYPE sumX4, x86_VECTOR_TYPE numers) {
return ADD_EPI32(sumX4, numers);
}
template <>
inline x86_VECTOR_TYPE add_vector<8U>(x86_VECTOR_TYPE sumX4, x86_VECTOR_TYPE numers) {
return ADD_EPI64(sumX4, numers);
}
template <typename IntT, typename Divisor>
NOINLINE uint64_t sum_quotients_vec(const random_numerators<IntT> &vals, const Divisor &div) {
size_t count = sizeof(x86_VECTOR_TYPE) / sizeof(IntT);
x86_VECTOR_TYPE sumX4 = SETZERO_SI();
for (auto iter = vals.begin(); iter != vals.end(); iter += count) {
x86_VECTOR_TYPE numers = LOAD_SI((const x86_VECTOR_TYPE *)iter);
numers = numers / div;
sumX4 = add_vector<sizeof(IntT)>(sumX4, numers);
}
return unsigned_sum_vals((const IntT *)&sumX4, count);
}
#elif defined(LIBDIVIDE_NEON)
// Helper to deduce NEON vector type for integral type.
template <typename T>
struct NeonVecFuncs {};
template <>
struct NeonVecFuncs<uint16_t> {
static inline uint16x8_t dup(uint16_t value) { return vdupq_n_u16(value); }
static inline uint16x8_t add(uint16x8_t a, uint16x8_t b) { return vaddq_u16(a, b); }
};
template <>
struct NeonVecFuncs<int16_t> {
static inline int16x8_t dup(int16_t value) { return vdupq_n_s16(value); }
static inline int16x8_t add(int16x8_t a, int16x8_t b) { return vaddq_s16(a, b); }
};
template <>
struct NeonVecFuncs<uint32_t> {
static inline uint32x4_t dup(uint32_t value) { return vdupq_n_u32(value); }
static inline uint32x4_t add(uint32x4_t a, uint32x4_t b) { return vaddq_u32(a, b); }
};
template <>
struct NeonVecFuncs<int32_t> {
static inline int32x4_t dup(int32_t value) { return vdupq_n_s32(value); }
static inline int32x4_t add(int32x4_t a, int32x4_t b) { return vaddq_s32(a, b); }
};
template <>
struct NeonVecFuncs<uint64_t> {
static inline uint64x2_t dup(uint64_t value) { return vdupq_n_u64(value); }
static inline uint64x2_t add(uint64x2_t a, uint64x2_t b) { return vaddq_u64(a, b); }
};
template <>
struct NeonVecFuncs<int64_t> {
static inline int64x2_t dup(int64_t value) { return vdupq_n_s64(value); }
static inline int64x2_t add(int64x2_t a, int64x2_t b) { return vaddq_s64(a, b); }
};
template <typename IntT, typename Divisor>
NOINLINE uint64_t sum_quotients_vec(const random_numerators<IntT> &vals, const Divisor &div) {
typedef typename NeonVecFor<IntT>::type NeonVectorType;
size_t count = sizeof(NeonVectorType) / sizeof(IntT);
NeonVectorType sumX4 = NeonVecFuncs<IntT>::dup(0);
for (auto iter = vals.begin(); iter != vals.end(); iter += count) {
NeonVectorType numers = *(NeonVectorType *)iter;
numers = numers / div;
sumX4 = NeonVecFuncs<IntT>::add(sumX4, numers);
}
return unsigned_sum_vals((const IntT *)&sumX4, count);
}
#endif
// noinline to force compiler to emit this
template <typename IntT>
NOINLINE divider<IntT> generate_1_divisor(IntT d) {
return divider<IntT>(d);
}
template <typename IntT>
NOINLINE void generate_divisor(const random_numerators<IntT> &vals, IntT denom) {
for (size_t iter = 0; iter < vals.length(); iter++) {
(void)generate_1_divisor(denom);
}
}
struct time_double {
uint64_t time; // in nanoseconds
uint64_t result;
};
template <typename IntT, class DenomT>
using pFuncToTime = uint64_t (*)(const random_numerators<IntT> &, const DenomT &);
template <typename IntT, class DenomT>
NOINLINE static time_double time_function(
const random_numerators<IntT> &vals, DenomT denom, pFuncToTime<IntT, DenomT> timeFunc) {
time_double tresult;
timer t;
t.start();
tresult.result = timeFunc(vals, denom);
t.stop();
tresult.time = t.duration_nano();
return tresult;
}
struct TestResult {
double hardware_time;
double base_time;
double branchfree_time;
double vector_time;
double vector_branchfree_time;
double gen_time;
int algo;
};
#define TEST_COUNT 30
inline void check_result(uint64_t expected, uint64_t actual, uint32_t line_no) {
if ((actual) != (expected)) {
PRINT_ERROR("Failure on line ");
PRINT_ERROR(line_no);
PRINT_ERROR("\n");
}
}
template <typename IntT>
NOINLINE TestResult test_one(const random_numerators<IntT> &vals, IntT denom) {
const bool testBranchfree = (denom != 1);
divider<IntT, BRANCHFULL> div_bfull(denom);
divider<IntT, BRANCHFREE> div_bfree(testBranchfree ? denom : 2);
uint64_t min_my_time = INT64_MAX, min_my_time_branchfree = INT64_MAX,
min_my_time_vector = INT64_MAX, min_my_time_vector_branchfree = INT64_MAX,
min_his_time = INT64_MAX, min_gen_time = INT64_MAX;
time_double tresult;
for (size_t iter = 0; iter < TEST_COUNT; iter++) {
tresult = time_function(vals, denom, sum_quotients);
min_his_time = (std::min)(min_his_time, tresult.time);
const uint64_t expected = tresult.result;
tresult = time_function(vals, div_bfull, sum_quotients);
min_my_time = (std::min)(min_my_time, tresult.time);
check_result(tresult.result, expected, __LINE__);
if (testBranchfree) {
tresult = time_function(vals, div_bfree, sum_quotients);
min_my_time_branchfree = (std::min)(min_my_time_branchfree, tresult.time);
check_result(tresult.result, expected, __LINE__);
}
#if defined(x86_VECTOR_TYPE) || defined(LIBDIVIDE_NEON)
tresult = time_function(vals, div_bfull, sum_quotients_vec);
min_my_time_vector = (std::min)(min_my_time_vector, tresult.time);
check_result(tresult.result, expected, __LINE__);
if (testBranchfree) {
tresult = time_function(vals, div_bfree, sum_quotients_vec);
min_my_time_vector_branchfree = (std::min)(min_my_time_vector_branchfree, tresult.time);
check_result(tresult.result, expected, __LINE__);
}
#else
min_my_time_vector = 0;
min_my_time_vector_branchfree = 0;
#endif
{
timer t;
t.start();
generate_divisor(vals, denom);
t.stop();
min_gen_time = (std::min)(min_gen_time, t.duration_nano());
}
}
TestResult result;
result.gen_time = min_gen_time / (double)vals.length();
result.base_time = min_my_time / (double)vals.length();
result.branchfree_time = testBranchfree ? min_my_time_branchfree / (double)vals.length() : -1;
result.vector_time = min_my_time_vector / (double)vals.length();
result.vector_branchfree_time =
testBranchfree ? min_my_time_vector_branchfree / (double)vals.length() : -1;
result.hardware_time = min_his_time / (double)vals.length();
return result;
}
template <typename _IntT>
int32_t get_algorithm(_IntT d) {
const auto denom = libdivide_gen(d);
uint8_t more = denom.more;
if (!denom.magic)
return 0;
else if (!(more & LIBDIVIDE_ADD_MARKER))
return 1;
else
return 2;
}
template <typename _IntT>
NOINLINE TestResult test_one(_IntT d, const random_numerators<_IntT> &data) {
struct TestResult result = test_one(data, d);
result.algo = get_algorithm(d);
return result;
}
// Result column width
#define PRIcw "10"
inline static void print_report_header(void) {
char buffer[256];
snprintf(buffer, sizeof buffer,
"%6s %" PRIcw "s %" PRIcw "s %" PRIcw "s %" PRIcw "s %" PRIcw "s %" PRIcw "s %6s\n", "#",
"system", "scalar", "scl_bf", "vector", "vec_bf", "gener", "algo");
PRINT_INFO(buffer);
}
// Result column format spec
#define PRIrc PRIcw ".3f"
template <typename _IntT>
static void print_report_result(_IntT d, struct TestResult result) {
char denom_buff[32];
char *pDenom = to_str(denom_buff, d);
char report_buff[256];
snprintf(report_buff, sizeof report_buff,
"%6s %" PRIrc " %" PRIrc " %" PRIrc " %" PRIrc " %" PRIrc " %" PRIrc " %4d\n", pDenom,
result.hardware_time, result.base_time, result.branchfree_time, result.vector_time,
result.vector_branchfree_time, result.gen_time, result.algo);
PRINT_INFO(report_buff);
}
template <typename _IntT>
static void print_banner() {
char type_buffer[64];
snprintf(type_buffer, sizeof type_buffer,
"=== libdivide %s benchmark ===", type_tag<_IntT>::get_tag());
char buffer[128];
snprintf(buffer, sizeof buffer, "\n%50s\n\n", type_buffer);
PRINT_INFO(buffer);
}
template <typename _IntT>
void test_many() {
print_banner<_IntT>();
print_report_header();
random_numerators<_IntT> data;
_IntT d = 1;
while (true) {
print_report_result(d, test_one(d, data));
if (std::numeric_limits<_IntT>::is_signed) {
d = -d;
if (d > 0) d++;
} else {
d++;
}
}
}