// blob: 61a7696b334cf465857353dadb77aa4a1f7f3cea
#include <emmintrin.h>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include "../common/benchmark.h"
#include "../common/halide_image.h"
#include "../common/halide_image_io.h"
#include "halide_blur.h"
// Brings Image<> (and the other Halide::Tools helpers) into scope unqualified.
using namespace Halide::Tools;
// Value returned by the most recent benchmark(...) call. Each blur variant
// below overwrites it, and main() copies it out right after each call.
// NOTE(review): presumably a time in seconds -- confirm against benchmark.h.
double t;
// Reference (scalar) two-pass 3-tap box blur.
//
// Pass 1 averages each pixel with its two right-hand neighbours into `tmp`;
// pass 2 averages each `tmp` pixel with the two rows below it into `out`.
// The result is (in.width() - 8) x (in.height() - 2), the same extent that
// blur_fast and blur_halide produce, so the three can be compared directly.
//
// Side effect: stores the value returned by benchmark(...) in the global `t`.
// The whole two-pass computation is what gets benchmarked.
Image<uint16_t> blur(Image<uint16_t> in) {
    Image<uint16_t> tmp(in.width()-8, in.height());
    Image<uint16_t> out(in.width()-8, in.height()-2);

    t = benchmark(10, 1, [&]() {
        // Horizontal pass. The uint16_t operands are promoted to int before
        // the addition, so the three-way sum cannot wrap.
        for (int y = 0; y < tmp.height(); y++)
            for (int x = 0; x < tmp.width(); x++)
                tmp(x, y) = (in(x, y) + in(x+1, y) + in(x+2, y))/3;

        // Vertical pass. `tmp` has two more rows than `out`, so y+2 stays
        // in bounds.
        for (int y = 0; y < out.height(); y++)
            for (int x = 0; x < out.width(); x++)
                out(x, y) = (tmp(x, y) + tmp(x, y+1) + tmp(x, y+2))/3;
    });

    return out;
}
// Hand-optimised SSE2 version of the 3x3 box blur, tiled and parallelised
// over rows of tiles with OpenMP.
//
// The output is processed in tiles of 128 x 32 pixels. For each tile the
// horizontal pass is computed for 32 + 2 input rows into a scratch buffer of
// __m128i vectors (8 uint16_t lanes each, 16 vectors per row); the vertical
// pass then combines scratch rows y, y+1 and y+2 into output row y.
//
// Division by three is done as a multiply-high by 21846 (= ceil(65536 / 3)):
// _mm_mulhi_epi16 keeps the upper 16 bits of the signed 16x16 product, which
// equals sum / 3 for every non-negative sum below 32768. main() feeds 12-bit
// samples, so the three-way sums stay far below that limit.
//
// No clamping is done for partial tiles: the loops always process whole
// 128 x 32 tiles, so out.width() must be a multiple of 128 and out.height()
// a multiple of 32 for all accesses to stay in bounds.
// NOTE(review): the aligned loads/stores assume Image rows start on 16-byte
// boundaries at x = xTile -- confirm against halide_image.h.
//
// Side effect: stores the value returned by benchmark(...) in the global `t`.
Image<uint16_t> blur_fast(Image<uint16_t> in) {
    Image<uint16_t> out(in.width()-8, in.height()-2);

    t = benchmark(10, 1, [&]() {
        __m128i one_third = _mm_set1_epi16(21846);
#pragma omp parallel for
        for (int yTile = 0; yTile < out.height(); yTile += 32) {
            __m128i a, b, c, sum, avg;
            // Per-iteration scratch: (128/8) vectors per row, 32 + 2 rows.
            __m128i tmp[(128/8) * (32 + 2)];
            for (int xTile = 0; xTile < out.width(); xTile += 128) {
                // Horizontal pass over the tile plus the two extra rows the
                // vertical pass needs.
                __m128i *tmpPtr = tmp;
                for (int y = 0; y < 32+2; y++) {
                    const uint16_t *inPtr = &(in(xTile, yTile+y));
                    for (int x = 0; x < 128; x += 8) {
                        a = _mm_load_si128(reinterpret_cast<const __m128i *>(inPtr));
                        b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(inPtr+1));
                        c = _mm_loadu_si128(reinterpret_cast<const __m128i *>(inPtr+2));
                        sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
                        avg = _mm_mulhi_epi16(sum, one_third);
                        _mm_store_si128(tmpPtr++, avg);
                        // Advance to the next 8 pixels of this row; without
                        // this every iteration would reload the same vector.
                        inPtr += 8;
                    }
                }

                // Vertical pass: tmpPtr walks the scratch buffer linearly,
                // and one scratch row is 128/8 vectors long.
                tmpPtr = tmp;
                for (int y = 0; y < 32; y++) {
                    __m128i *outPtr = reinterpret_cast<__m128i *>(&(out(xTile, yTile+y)));
                    for (int x = 0; x < 128; x += 8) {
                        a = _mm_load_si128(tmpPtr+(2*128)/8);
                        b = _mm_load_si128(tmpPtr+128/8);
                        c = _mm_load_si128(tmpPtr++);
                        sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
                        avg = _mm_mulhi_epi16(sum, one_third);
                        _mm_store_si128(outPtr++, avg);
                    }
                }
            }
        }
    });

    return out;
}
// Runs the Halide-generated blur pipeline (halide_blur, declared in
// halide_blur.h) over `in` and returns the result.
//
// The output buffer has the same (in.width() - 8) x (in.height() - 2) extent
// as the other two implementations so main() can compare them pixel by pixel.
//
// Side effect: stores the value returned by benchmark(...) in the global `t`.
Image<uint16_t> blur_halide(Image<uint16_t> in) {
    Image<uint16_t> out(in.width()-8, in.height()-2);

    // Call it once to initialize the halide runtime stuff, so that one-time
    // setup is not part of the measured calls.
    halide_blur(in, out);

    t = benchmark(10, 1, [&]() {
        // Compute the same region of the output as blur_fast (i.e., we're
        // still being sloppy with boundary conditions)
        halide_blur(in, out);
    });

    return out;
}
// Benchmark driver: fills a 6408 x 4802 image with pseudo-random 12-bit
// samples, runs the three blur implementations, prints the value each one
// left in `t`, and reports every interior pixel where they disagree.
//
// The input size makes the outputs 6400 x 4800, i.e. whole multiples of the
// 128 x 32 tiles that blur_fast processes.
int main(int argc, char **argv) {
    Image<uint16_t> input(6408, 4802);

    // Samples are masked to 12 bits, so a sum of three stays well inside the
    // signed 16-bit range that blur_fast's SSE arithmetic relies on.
    for (int y = 0; y < input.height(); y++) {
        for (int x = 0; x < input.width(); x++) {
            input(x, y) = rand() & 0xfff;
        }
    }

    // `t` is overwritten by every call, so copy it out immediately.
    Image<uint16_t> blurry = blur(input);
    double slow_time = t;

    Image<uint16_t> speedy = blur_fast(input);
    double fast_time = t;

    Image<uint16_t> halide = blur_halide(input);
    double halide_time = t;

    printf("times: %f %f %f\n", slow_time, fast_time, halide_time);

    // Compare only the interior, staying 64 pixels away from the input's
    // edges; the scalar result is the reference.
    for (int y = 64; y < input.height() - 64; y++) {
        for (int x = 64; x < input.width() - 64; x++) {
            if (blurry(x, y) != speedy(x, y) || blurry(x, y) != halide(x, y))
                printf("difference at (%d,%d): %d %d %d\n", x, y, blurry(x, y), speedy(x, y), halide(x, y));
        }
    }

    return 0;
}