libavfilter/opencl/deshake.cl - third_party/ffmpeg - Git at Google

 /*
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * Copyright (C) 2000, Intel Corporation, all rights reserved.
  * Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  * Third party copyrights are property of their respective owners.
  *
  * Redistribution and use in source and binary forms, with or without modification,
  * are permitted provided that the following conditions are met:
  *
  *   * Redistribution's of source code must retain the above copyright notice,
  *     this list of conditions and the following disclaimer.
  *
  *   * Redistribution's in binary form must reproduce the above copyright notice,
  *     this list of conditions and the following disclaimer in the documentation
  *     and/or other materials provided with the distribution.
  *
  *   * The name of the copyright holders may not be used to endorse or promote products
  *     derived from this software without specific prior written permission.
  *
  * This software is provided by the copyright holders and contributors "as is" and
  * any express or implied warranties, including, but not limited to, the implied
  * warranties of merchantability and fitness for a particular purpose are disclaimed.
  * In no event shall the Intel Corporation or contributors be liable for any direct,
  * indirect, incidental, special, exemplary, or consequential damages
  * (including, but not limited to, procurement of substitute goods or services;
  * loss of use, data, or profits; or business interruption) however caused
  * and on any theory of liability, whether in contract, strict liability,
  * or tort (including negligence or otherwise) arising in any way out of
  * the use of this software, even if advised of the possibility of such damage.
  */

 #define HARRIS_THRESHOLD 3.0f
 // Block size over which to compute harris response
 //
 // Note that changing this will require fiddling with the local array sizes in
 // harris_response
 #define HARRIS_RADIUS 2
 #define DISTANCE_THRESHOLD 80

 // Sub-pixel refinement window for feature points
 #define REFINE_WIN_HALF_W 5
 #define REFINE_WIN_HALF_H 5
 #define REFINE_WIN_W 11 // REFINE_WIN_HALF_W * 2 + 1
 #define REFINE_WIN_H 11

 // Non-maximum suppression window size
 #define NONMAX_WIN 30
 #define NONMAX_WIN_HALF 15 // NONMAX_WIN / 2

 typedef struct PointPair {
     // Previous frame
     float2 p1;
     // Current frame
     float2 p2;
 } PointPair;

 typedef struct SmoothedPointPair {
     // Non-smoothed point in current frame
     int2 p1;
     // Smoothed point in current frame
     float2 p2;
 } SmoothedPointPair;

 typedef struct MotionVector {
     PointPair p;
     // Used to mark vectors as potential outliers
     int should_consider;
 } MotionVector;

 const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                           CLK_ADDRESS_CLAMP_TO_EDGE |
                           CLK_FILTER_NEAREST;

 const sampler_t sampler_linear = CLK_NORMALIZED_COORDS_FALSE |
                           CLK_ADDRESS_CLAMP_TO_EDGE |
                           CLK_FILTER_LINEAR;

 const sampler_t sampler_linear_mirror = CLK_NORMALIZED_COORDS_TRUE |
                           CLK_ADDRESS_MIRRORED_REPEAT |
                           CLK_FILTER_LINEAR;

 // Writes to a 1D array at loc, treating it as a 2D array with the same
 // dimensions as the global work size.
 static void write_to_1d_arrf(__global float *buf, int2 loc, float val) {
     buf[loc.x + loc.y * get_global_size(0)] = val;
 }

 static void write_to_1d_arrul8(__global ulong8 *buf, int2 loc, ulong8 val) {
     buf[loc.x + loc.y * get_global_size(0)] = val;
 }

 static void write_to_1d_arrvec(__global MotionVector *buf, int2 loc, MotionVector val) {
     buf[loc.x + loc.y * get_global_size(0)] = val;
 }

 static void write_to_1d_arrf2(__global float2 *buf, int2 loc, float2 val) {
     buf[loc.x + loc.y * get_global_size(0)] = val;
 }

 static ulong8 read_from_1d_arrul8(__global const ulong8 *buf, int2 loc) {
     return buf[loc.x + loc.y * get_global_size(0)];
 }

 static float2 read_from_1d_arrf2(__global const float2 *buf, int2 loc) {
     return buf[loc.x + loc.y * get_global_size(0)];
 }

 // Returns the grayscale value at the given point.
 static float pixel_grayscale(__read_only image2d_t src, int2 loc) {
     float4 pixel = read_imagef(src, sampler, loc);
     return (pixel.x + pixel.y + pixel.z) / 3.0f;
 }

 static float convolve(
     __local const float *grayscale,
     int local_idx_x,
     int local_idx_y,
     float mask[3][3]
 ) {
     float ret = 0;

     // These loops touch each pixel surrounding loc as well as loc itself
     for (int i = 1, i2 = 0; i >= -1; --i, ++i2) {
         for (int j = -1, j2 = 0; j <= 1; ++j, ++j2) {
             ret += mask[i2][j2] * grayscale[(local_idx_x + 3 + j) + (local_idx_y + 3 + i) * 14];
         }
     }

     return ret;
 }

 // Sums dx * dy for all pixels within radius of loc
 static float sum_deriv_prod(
     __local const float *grayscale,
     float mask_x[3][3],
     float mask_y[3][3]
 ) {
     float ret = 0;

     for (int i = HARRIS_RADIUS; i >= -HARRIS_RADIUS; --i) {
         for (int j = -HARRIS_RADIUS; j <= HARRIS_RADIUS; ++j) {
             ret += convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask_x) *
                    convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask_y);
         }
     }

     return ret;
 }

 // Sums d<>^2 (determined by mask) for all pixels within radius of loc
 static float sum_deriv_pow(__local const float *grayscale, float mask[3][3])
 {
     float ret = 0;

     for (int i = HARRIS_RADIUS; i >= -HARRIS_RADIUS; --i) {
         for (int j = -HARRIS_RADIUS; j <= HARRIS_RADIUS; ++j) {
             float deriv = convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask);
             ret += deriv * deriv;
         }
     }

     return ret;
 }

 // Fills a box with the given radius and pixel around loc
 static void draw_box(__write_only image2d_t dst, int2 loc, float4 pixel, int radius)
 {
     for (int i = -radius; i <= radius; ++i) {
         for (int j = -radius; j <= radius; ++j) {
             write_imagef(
                 dst,
                 (int2)(
                     // Clamp to avoid writing outside image bounds
                     clamp(loc.x + i, 0, get_image_dim(dst).x - 1),
                     clamp(loc.y + j, 0, get_image_dim(dst).y - 1)
                 ),
                 pixel
             );
         }
     }
 }

 // Converts the src image to grayscale
 __kernel void grayscale(
     __read_only image2d_t src,
     __write_only image2d_t grayscale
 ) {
     int2 loc = (int2)(get_global_id(0), get_global_id(1));
     write_imagef(grayscale, loc, (float4)(pixel_grayscale(src, loc), 0.0f, 0.0f, 1.0f));
 }

 // This kernel computes the harris response for the given grayscale src image
 // within the given radius and writes it to harris_buf
 __kernel void harris_response(
     __read_only image2d_t grayscale,
     __global float *harris_buf
 ) {
     int2 loc = (int2)(get_global_id(0), get_global_id(1));

     if (loc.x > get_image_width(grayscale) - 1 || loc.y > get_image_height(grayscale) - 1) {
         write_to_1d_arrf(harris_buf, loc, 0);
         return;
     }

     float scale = 1.0f / ((1 << 2) * HARRIS_RADIUS * 255.0f);

     float sobel_mask_x[3][3] = {
         {-1, 0, 1},
         {-2, 0, 2},
         {-1, 0, 1}
     };

     float sobel_mask_y[3][3] = {
         { 1,  2,  1},
         { 0,  0,  0},
         {-1, -2, -1}
     };

     // 8 x 8 local work + 3 pixels around each side (needed to accomodate for the
     // block size radius of 2)
     __local float grayscale_data[196];

     int idx = get_group_id(0) * get_local_size(0);
     int idy = get_group_id(1) * get_local_size(1);

     for (int i = idy - 3, it = 0; i < idy + (int)get_local_size(1) + 3; i++, it++) {
         for (int j = idx - 3, jt = 0; j < idx + (int)get_local_size(0) + 3; j++, jt++) {
             grayscale_data[jt + it * 14] = read_imagef(grayscale, sampler, (int2)(j, i)).x;
         }
     }

     barrier(CLK_LOCAL_MEM_FENCE);

     float sumdxdy = sum_deriv_prod(grayscale_data, sobel_mask_x, sobel_mask_y);
     float sumdx2 = sum_deriv_pow(grayscale_data, sobel_mask_x);
     float sumdy2 = sum_deriv_pow(grayscale_data, sobel_mask_y);

     float trace = sumdx2 + sumdy2;
     // r = det(M) - k(trace(M))^2
     // k usually between 0.04 to 0.06
     float r = (sumdx2 * sumdy2 - sumdxdy * sumdxdy) - 0.04f * (trace * trace) * pown(scale, 4);

     // Threshold the r value
     harris_buf[loc.x + loc.y * get_image_width(grayscale)] = r * step(HARRIS_THRESHOLD, r);
 }

 // Gets a patch centered around a float coordinate from a grayscale image using
 // bilinear interpolation
 static void get_rect_sub_pix(
     __read_only image2d_t grayscale,
     float *buffer,
     int size_x,
     int size_y,
     float2 center
 ) {
     float2 offset = ((float2)(size_x, size_y) - 1.0f) * 0.5f;

     for (int i = 0; i < size_y; i++) {
         for (int j = 0; j < size_x; j++) {
             buffer[i * size_x + j] = read_imagef(
                 grayscale,
                 sampler_linear,
                 (float2)(j, i) + center - offset
             ).x * 255.0f;
         }
     }
 }

 // Refines detected features at a sub-pixel level
 //
 // This function is ported from OpenCV
 static float2 corner_sub_pix(
     __read_only image2d_t grayscale,
     float2 feature,
     float *mask
 ) {
     float2 init = feature;
     int src_width = get_global_size(0);
     int src_height = get_global_size(1);

     const int max_iters = 40;
     const float eps = 0.001f * 0.001f;
     int i, j, k;

     int iter = 0;
     float err = 0;
     float subpix[(REFINE_WIN_W + 2) * (REFINE_WIN_H + 2)];
     const float flt_epsilon = 0x1.0p-23f;

     do {
         float2 feature_tmp;
         float a = 0, b = 0, c = 0, bb1 = 0, bb2 = 0;

         get_rect_sub_pix(grayscale, subpix, REFINE_WIN_W + 2, REFINE_WIN_H + 2, feature);
         float *subpix_ptr = subpix;
         subpix_ptr += REFINE_WIN_W + 2 + 1;

         // process gradient
         for (i = 0, k = 0; i < REFINE_WIN_H; i++, subpix_ptr += REFINE_WIN_W + 2) {
             float py = i - REFINE_WIN_HALF_H;

             for (j = 0; j < REFINE_WIN_W; j++, k++) {
                 float m = mask[k];
                 float tgx = subpix_ptr[j + 1] - subpix_ptr[j - 1];
                 float tgy = subpix_ptr[j + REFINE_WIN_W + 2] - subpix_ptr[j - REFINE_WIN_W - 2];
                 float gxx = tgx * tgx * m;
                 float gxy = tgx * tgy * m;
                 float gyy = tgy * tgy * m;
                 float px = j - REFINE_WIN_HALF_W;

                 a += gxx;
                 b += gxy;
                 c += gyy;

                 bb1 += gxx * px + gxy * py;
                 bb2 += gxy * px + gyy * py;
             }
         }

         float det = a * c - b * b;
         if (fabs(det) <= flt_epsilon * flt_epsilon) {
             break;
         }

         // 2x2 matrix inversion
         float scale = 1.0f / det;
         feature_tmp.x = (float)(feature.x + (c * scale * bb1) - (b * scale * bb2));
         feature_tmp.y = (float)(feature.y - (b * scale * bb1) + (a * scale * bb2));
         err = dot(feature_tmp - feature, feature_tmp - feature);

         feature = feature_tmp;
         if (feature.x < 0 || feature.x >= src_width || feature.y < 0 || feature.y >= src_height) {
             break;
         }
     } while (++iter < max_iters && err > eps);

     // Make sure new point isn't too far from the initial point (indicates poor convergence)
     if (fabs(feature.x - init.x) > REFINE_WIN_HALF_W || fabs(feature.y - init.y) > REFINE_WIN_HALF_H) {
         feature = init;
     }

     return feature;
 }

 // Performs non-maximum suppression on the harris response and writes the resulting
 // feature locations to refined_features.
 //
 // Assumes that refined_features and the global work sizes are set up such that the image
 // is split up into a grid of 32x32 blocks where each block has a single slot in the
 // refined_features buffer. This kernel finds the best corner in each block (if the
 // block has any) and writes it to the corresponding slot in the buffer.
 //
 // If subpixel_refine is true, the features are additionally refined at a sub-pixel
 // level for increased precision.
 __kernel void refine_features(
     __read_only image2d_t grayscale,
     __global const float *harris_buf,
     __global float2 *refined_features,
     int subpixel_refine
 ) {
     int2 loc = (int2)(get_global_id(0), get_global_id(1));
     // The location in the grayscale buffer rather than the compacted grid
     int2 loc_i = (int2)(loc.x * 32, loc.y * 32);

     float new_val;
     float max_val = 0;
     float2 loc_max = (float2)(-1, -1);

     int end_x = min(loc_i.x + 32, (int)get_image_dim(grayscale).x - 1);
     int end_y = min(loc_i.y + 32, (int)get_image_dim(grayscale).y - 1);

     for (int i = loc_i.x; i < end_x; ++i) {
         for (int j = loc_i.y; j < end_y; ++j) {
             new_val = harris_buf[i + j * get_image_dim(grayscale).x];

             if (new_val > max_val) {
                 max_val = new_val;
                 loc_max = (float2)(i, j);
             }
         }
     }

     if (max_val == 0) {
         // There are no features in this part of the frame
         write_to_1d_arrf2(refined_features, loc, loc_max);
         return;
     }

     if (subpixel_refine) {
         float mask[REFINE_WIN_H * REFINE_WIN_W];
         for (int i = 0; i < REFINE_WIN_H; i++) {
             float y = (float)(i - REFINE_WIN_HALF_H) / REFINE_WIN_HALF_H;
             float vy = exp(-y * y);

             for (int j = 0; j < REFINE_WIN_W; j++) {
                 float x = (float)(j - REFINE_WIN_HALF_W) / REFINE_WIN_HALF_W;
                 mask[i * REFINE_WIN_W + j] = (float)(vy * exp(-x * x));
             }
         }

         loc_max = corner_sub_pix(grayscale, loc_max, mask);
     }

     write_to_1d_arrf2(refined_features, loc, loc_max);
 }

 // Extracts BRIEF descriptors from the grayscale src image for the given features
 // using the provided sampler.
 __kernel void brief_descriptors(
     __read_only image2d_t grayscale,
     __global const float2 *refined_features,
     // for 512 bit descriptors
     __global ulong8 *desc_buf,
     __global const PointPair *brief_pattern
 ) {
     int2 loc = (int2)(get_global_id(0), get_global_id(1));
     float2 feature = read_from_1d_arrf2(refined_features, loc);

     // There was no feature in this part of the frame
     if (feature.x == -1) {
         write_to_1d_arrul8(desc_buf, loc, (ulong8)(0));
         return;
     }

     ulong8 desc = 0;
     ulong *p = &desc;

     for (int i = 0; i < 8; ++i) {
         for (int j = 0; j < 64; ++j) {
             PointPair pair = brief_pattern[j * (i + 1)];
             float l1 = read_imagef(grayscale, sampler_linear, feature + pair.p1).x;
             float l2 = read_imagef(grayscale, sampler_linear, feature + pair.p2).x;

             if (l1 < l2) {
                 p[i] |= 1UL << j;
             }
         }
     }

     write_to_1d_arrul8(desc_buf, loc, desc);
 }

 // Given buffers with descriptors for the current and previous frame, determines
 // which ones match, writing correspondences to matches_buf.
 //
 // Feature and descriptor buffers are assumed to be compacted (each element sourced
 // from a 32x32 block in the frame being processed).
 __kernel void match_descriptors(
     __global const float2 *prev_refined_features,
     __global const float2 *refined_features,
     __global const ulong8 *desc_buf,
     __global const ulong8 *prev_desc_buf,
     __global MotionVector *matches_buf
 ) {
     int2 loc = (int2)(get_global_id(0), get_global_id(1));
     ulong8 desc = read_from_1d_arrul8(desc_buf, loc);
     const int search_radius = 3;

     MotionVector invalid_vector = (MotionVector) {
         (PointPair) {
             (float2)(-1, -1),
             (float2)(-1, -1)
         },
         0
     };

     if (desc.s0 == 0 && desc.s1 == 0) {
         // There was no feature in this part of the frame
         write_to_1d_arrvec(
             matches_buf,
             loc,
             invalid_vector
         );
         return;
     }

     int2 start = max(loc - search_radius, 0);
     int2 end = min(loc + search_radius, (int2)(get_global_size(0) - 1, get_global_size(1) - 1));

     for (int i = start.x; i < end.x; ++i) {
         for (int j = start.y; j < end.y; ++j) {
             int2 prev_point = (int2)(i, j);
             int total_dist = 0;

             ulong8 prev_desc = read_from_1d_arrul8(prev_desc_buf, prev_point);

             if (prev_desc.s0 == 0 && prev_desc.s1 == 0) {
                 continue;
             }

             ulong *prev_desc_p = &prev_desc;
             ulong *desc_p = &desc;

             for (int i = 0; i < 8; i++) {
                 total_dist += popcount(desc_p[i] ^ prev_desc_p[i]);
             }

             if (total_dist < DISTANCE_THRESHOLD) {
                 write_to_1d_arrvec(
                     matches_buf,
                     loc,
                     (MotionVector) {
                         (PointPair) {
                             read_from_1d_arrf2(prev_refined_features, prev_point),
                             read_from_1d_arrf2(refined_features, loc)
                         },
                         1
                     }
                 );

                 return;
             }
         }
     }

     // There is no found match for this point
     write_to_1d_arrvec(
         matches_buf,
         loc,
         invalid_vector
     );
 }

 // Returns the position of the given point after the transform is applied
 static float2 transformed_point(float2 p, __global const float *transform) {
     float2 ret;

     ret.x = p.x * transform[0] + p.y * transform[1] + transform[2];
     ret.y = p.x * transform[3] + p.y * transform[4] + transform[5];

     return ret;
 }


 // Performs the given transform on the src image
 __kernel void transform(
     __read_only image2d_t src,
     __write_only image2d_t dst,
     __global const float *transform
 ) {
     int2 loc = (int2)(get_global_id(0), get_global_id(1));
     float2 norm = convert_float2(get_image_dim(src));

     write_imagef(
         dst,
         loc,
         read_imagef(
             src,
             sampler_linear_mirror,
             transformed_point((float2)(loc.x, loc.y), transform) / norm
         )
     );
 }

 // Returns the new location of the given point using the given crop bounding box
 // and the width and height of the original frame.
 static float2 cropped_point(
     float2 p,
     float2 top_left,
     float2 bottom_right,
     int2 orig_dim
 ) {
     float2 ret;

     float crop_width  = bottom_right.x - top_left.x;
     float crop_height = bottom_right.y - top_left.y;

     float width_norm = p.x / (float)orig_dim.x;
     float height_norm = p.y / (float)orig_dim.y;

     ret.x = (width_norm * crop_width) + top_left.x;
     ret.y = (height_norm * crop_height) + ((float)orig_dim.y - bottom_right.y);

     return ret;
 }

 // Upscales the given cropped region to the size of the original frame
 __kernel void crop_upscale(
     __read_only image2d_t src,
     __write_only image2d_t dst,
     float2 top_left,
     float2 bottom_right
 ) {
     int2 loc = (int2)(get_global_id(0), get_global_id(1));

     write_imagef(
         dst,
         loc,
         read_imagef(
             src,
             sampler_linear,
             cropped_point((float2)(loc.x, loc.y), top_left, bottom_right, get_image_dim(dst))
         )
     );
 }

 // Draws boxes to represent the given point matches and uses the given transform
 // and crop info to make sure their positions are accurate on the transformed frame.
 //
 // model_matches is an array of three points that were used by the RANSAC process
 // to generate the given transform
 __kernel void draw_debug_info(
     __write_only image2d_t dst,
     __global const MotionVector *matches,
     __global const MotionVector *model_matches,
     int num_model_matches,
     __global const float *transform
 ) {
     int loc = get_global_id(0);
     MotionVector vec = matches[loc];
     // Black box: matched point that RANSAC considered an outlier
     float4 big_rect_color = (float4)(0.1f, 0.1f, 0.1f, 1.0f);

     if (vec.should_consider) {
         // Green box: matched point that RANSAC considered an inlier
         big_rect_color = (float4)(0.0f, 1.0f, 0.0f, 1.0f);
     }

     for (int i = 0; i < num_model_matches; i++) {
         if (vec.p.p2.x == model_matches[i].p.p2.x && vec.p.p2.y == model_matches[i].p.p2.y) {
             // Orange box: point used to calculate model
             big_rect_color = (float4)(1.0f, 0.5f, 0.0f, 1.0f);
         }
     }

     float2 transformed_p1 = transformed_point(vec.p.p1, transform);
     float2 transformed_p2 = transformed_point(vec.p.p2, transform);

     draw_box(dst, (int2)(transformed_p2.x, transformed_p2.y), big_rect_color, 5);
     // Small light blue box: the point in the previous frame
     draw_box(dst, (int2)(transformed_p1.x, transformed_p1.y), (float4)(0.0f, 0.3f, 0.7f, 1.0f), 3);
 }
	/*
	* This file is part of FFmpeg.
	*
	* FFmpeg is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* FFmpeg is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with FFmpeg; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*
	* Copyright (C) 2000, Intel Corporation, all rights reserved.
	* Copyright (C) 2013, OpenCV Foundation, all rights reserved.
	* Third party copyrights are property of their respective owners.
	*
	* Redistribution and use in source and binary forms, with or without modification,
	* are permitted provided that the following conditions are met:
	*
	* * Redistribution's of source code must retain the above copyright notice,
	* this list of conditions and the following disclaimer.
	*
	* * Redistribution's in binary form must reproduce the above copyright notice,
	* this list of conditions and the following disclaimer in the documentation
	* and/or other materials provided with the distribution.
	*
	* * The name of the copyright holders may not be used to endorse or promote products
	* derived from this software without specific prior written permission.
	*
	* This software is provided by the copyright holders and contributors "as is" and
	* any express or implied warranties, including, but not limited to, the implied
	* warranties of merchantability and fitness for a particular purpose are disclaimed.
	* In no event shall the Intel Corporation or contributors be liable for any direct,
	* indirect, incidental, special, exemplary, or consequential damages
	* (including, but not limited to, procurement of substitute goods or services;
	* loss of use, data, or profits; or business interruption) however caused
	* and on any theory of liability, whether in contract, strict liability,
	* or tort (including negligence or otherwise) arising in any way out of
	* the use of this software, even if advised of the possibility of such damage.
	*/

	#define HARRIS_THRESHOLD 3.0f
	// Block size over which to compute harris response
	//
	// Note that changing this will require fiddling with the local array sizes in
	// harris_response
	#define HARRIS_RADIUS 2
	#define DISTANCE_THRESHOLD 80

	// Sub-pixel refinement window for feature points
	#define REFINE_WIN_HALF_W 5
	#define REFINE_WIN_HALF_H 5
	#define REFINE_WIN_W 11 // REFINE_WIN_HALF_W * 2 + 1
	#define REFINE_WIN_H 11

	// Non-maximum suppression window size
	#define NONMAX_WIN 30
	#define NONMAX_WIN_HALF 15 // NONMAX_WIN / 2

	typedef struct PointPair {
	// Previous frame
	float2 p1;
	// Current frame
	float2 p2;
	} PointPair;

	typedef struct SmoothedPointPair {
	// Non-smoothed point in current frame
	int2 p1;
	// Smoothed point in current frame
	float2 p2;
	} SmoothedPointPair;

	typedef struct MotionVector {
	PointPair p;
	// Used to mark vectors as potential outliers
	int should_consider;
	} MotionVector;

	const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \|
	CLK_ADDRESS_CLAMP_TO_EDGE \|
	CLK_FILTER_NEAREST;

	const sampler_t sampler_linear = CLK_NORMALIZED_COORDS_FALSE \|
	CLK_ADDRESS_CLAMP_TO_EDGE \|
	CLK_FILTER_LINEAR;

	const sampler_t sampler_linear_mirror = CLK_NORMALIZED_COORDS_TRUE \|
	CLK_ADDRESS_MIRRORED_REPEAT \|
	CLK_FILTER_LINEAR;

	// Writes to a 1D array at loc, treating it as a 2D array with the same
	// dimensions as the global work size.
	static void write_to_1d_arrf(__global float *buf, int2 loc, float val) {
	buf[loc.x + loc.y * get_global_size(0)] = val;
	}

	static void write_to_1d_arrul8(__global ulong8 *buf, int2 loc, ulong8 val) {
	buf[loc.x + loc.y * get_global_size(0)] = val;
	}

	static void write_to_1d_arrvec(__global MotionVector *buf, int2 loc, MotionVector val) {
	buf[loc.x + loc.y * get_global_size(0)] = val;
	}

	static void write_to_1d_arrf2(__global float2 *buf, int2 loc, float2 val) {
	buf[loc.x + loc.y * get_global_size(0)] = val;
	}

	static ulong8 read_from_1d_arrul8(__global const ulong8 *buf, int2 loc) {
	return buf[loc.x + loc.y * get_global_size(0)];
	}

	static float2 read_from_1d_arrf2(__global const float2 *buf, int2 loc) {
	return buf[loc.x + loc.y * get_global_size(0)];
	}

	// Returns the grayscale value at the given point.
	static float pixel_grayscale(__read_only image2d_t src, int2 loc) {
	float4 pixel = read_imagef(src, sampler, loc);
	return (pixel.x + pixel.y + pixel.z) / 3.0f;
	}

	static float convolve(
	__local const float *grayscale,
	int local_idx_x,
	int local_idx_y,
	float mask[3][3]
	) {
	float ret = 0;

	// These loops touch each pixel surrounding loc as well as loc itself
	for (int i = 1, i2 = 0; i >= -1; --i, ++i2) {
	for (int j = -1, j2 = 0; j <= 1; ++j, ++j2) {
	ret += mask[i2][j2] * grayscale[(local_idx_x + 3 + j) + (local_idx_y + 3 + i) * 14];
	}
	}

	return ret;
	}

	// Sums dx * dy for all pixels within radius of loc
	static float sum_deriv_prod(
	__local const float *grayscale,
	float mask_x[3][3],
	float mask_y[3][3]
	) {
	float ret = 0;

	for (int i = HARRIS_RADIUS; i >= -HARRIS_RADIUS; --i) {
	for (int j = -HARRIS_RADIUS; j <= HARRIS_RADIUS; ++j) {
	ret += convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask_x) *
	convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask_y);
	}
	}

	return ret;
	}

	// Sums d<>^2 (determined by mask) for all pixels within radius of loc
	static float sum_deriv_pow(__local const float *grayscale, float mask[3][3])
	{
	float ret = 0;

	for (int i = HARRIS_RADIUS; i >= -HARRIS_RADIUS; --i) {
	for (int j = -HARRIS_RADIUS; j <= HARRIS_RADIUS; ++j) {
	float deriv = convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask);
	ret += deriv * deriv;
	}
	}

	return ret;
	}

	// Fills a box with the given radius and pixel around loc
	static void draw_box(__write_only image2d_t dst, int2 loc, float4 pixel, int radius)
	{
	for (int i = -radius; i <= radius; ++i) {
	for (int j = -radius; j <= radius; ++j) {
	write_imagef(
	dst,
	(int2)(
	// Clamp to avoid writing outside image bounds
	clamp(loc.x + i, 0, get_image_dim(dst).x - 1),
	clamp(loc.y + j, 0, get_image_dim(dst).y - 1)
	),
	pixel
	);
	}
	}
	}

	// Converts the src image to grayscale
	__kernel void grayscale(
	__read_only image2d_t src,
	__write_only image2d_t grayscale
	) {
	int2 loc = (int2)(get_global_id(0), get_global_id(1));
	write_imagef(grayscale, loc, (float4)(pixel_grayscale(src, loc), 0.0f, 0.0f, 1.0f));
	}

	// This kernel computes the harris response for the given grayscale src image
	// within the given radius and writes it to harris_buf
	__kernel void harris_response(
	__read_only image2d_t grayscale,
	__global float *harris_buf
	) {
	int2 loc = (int2)(get_global_id(0), get_global_id(1));

	if (loc.x > get_image_width(grayscale) - 1 \|\| loc.y > get_image_height(grayscale) - 1) {
	write_to_1d_arrf(harris_buf, loc, 0);
	return;
	}

	float scale = 1.0f / ((1 << 2) * HARRIS_RADIUS * 255.0f);

	float sobel_mask_x[3][3] = {
	{-1, 0, 1},
	{-2, 0, 2},
	{-1, 0, 1}
	};

	float sobel_mask_y[3][3] = {
	{ 1, 2, 1},
	{ 0, 0, 0},
	{-1, -2, -1}
	};

	// 8 x 8 local work + 3 pixels around each side (needed to accomodate for the
	// block size radius of 2)
	__local float grayscale_data[196];

	int idx = get_group_id(0) * get_local_size(0);
	int idy = get_group_id(1) * get_local_size(1);

	for (int i = idy - 3, it = 0; i < idy + (int)get_local_size(1) + 3; i++, it++) {
	for (int j = idx - 3, jt = 0; j < idx + (int)get_local_size(0) + 3; j++, jt++) {
	grayscale_data[jt + it * 14] = read_imagef(grayscale, sampler, (int2)(j, i)).x;
	}
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	float sumdxdy = sum_deriv_prod(grayscale_data, sobel_mask_x, sobel_mask_y);
	float sumdx2 = sum_deriv_pow(grayscale_data, sobel_mask_x);
	float sumdy2 = sum_deriv_pow(grayscale_data, sobel_mask_y);

	float trace = sumdx2 + sumdy2;
	// r = det(M) - k(trace(M))^2
	// k usually between 0.04 to 0.06
	float r = (sumdx2 * sumdy2 - sumdxdy * sumdxdy) - 0.04f * (trace * trace) * pown(scale, 4);

	// Threshold the r value
	harris_buf[loc.x + loc.y * get_image_width(grayscale)] = r * step(HARRIS_THRESHOLD, r);
	}

	// Gets a patch centered around a float coordinate from a grayscale image using
	// bilinear interpolation
	static void get_rect_sub_pix(
	__read_only image2d_t grayscale,
	float *buffer,
	int size_x,
	int size_y,
	float2 center
	) {
	float2 offset = ((float2)(size_x, size_y) - 1.0f) * 0.5f;

	for (int i = 0; i < size_y; i++) {
	for (int j = 0; j < size_x; j++) {
	buffer[i * size_x + j] = read_imagef(
	grayscale,
	sampler_linear,
	(float2)(j, i) + center - offset
	).x * 255.0f;
	}
	}
	}

	// Refines detected features at a sub-pixel level
	//
	// This function is ported from OpenCV
	static float2 corner_sub_pix(
	__read_only image2d_t grayscale,
	float2 feature,
	float *mask
	) {
	float2 init = feature;
	int src_width = get_global_size(0);
	int src_height = get_global_size(1);

	const int max_iters = 40;
	const float eps = 0.001f * 0.001f;
	int i, j, k;

	int iter = 0;
	float err = 0;
	float subpix[(REFINE_WIN_W + 2) * (REFINE_WIN_H + 2)];
	const float flt_epsilon = 0x1.0p-23f;

	do {
	float2 feature_tmp;
	float a = 0, b = 0, c = 0, bb1 = 0, bb2 = 0;

	get_rect_sub_pix(grayscale, subpix, REFINE_WIN_W + 2, REFINE_WIN_H + 2, feature);
	float *subpix_ptr = subpix;
	subpix_ptr += REFINE_WIN_W + 2 + 1;

	// process gradient
	for (i = 0, k = 0; i < REFINE_WIN_H; i++, subpix_ptr += REFINE_WIN_W + 2) {
	float py = i - REFINE_WIN_HALF_H;

	for (j = 0; j < REFINE_WIN_W; j++, k++) {
	float m = mask[k];
	float tgx = subpix_ptr[j + 1] - subpix_ptr[j - 1];
	float tgy = subpix_ptr[j + REFINE_WIN_W + 2] - subpix_ptr[j - REFINE_WIN_W - 2];
	float gxx = tgx * tgx * m;
	float gxy = tgx * tgy * m;
	float gyy = tgy * tgy * m;
	float px = j - REFINE_WIN_HALF_W;

	a += gxx;
	b += gxy;
	c += gyy;

	bb1 += gxx * px + gxy * py;
	bb2 += gxy * px + gyy * py;
	}
	}

	float det = a * c - b * b;
	if (fabs(det) <= flt_epsilon * flt_epsilon) {
	break;
	}

	// 2x2 matrix inversion
	float scale = 1.0f / det;
	feature_tmp.x = (float)(feature.x + (c * scale * bb1) - (b * scale * bb2));
	feature_tmp.y = (float)(feature.y - (b * scale * bb1) + (a * scale * bb2));
	err = dot(feature_tmp - feature, feature_tmp - feature);

	feature = feature_tmp;
	if (feature.x < 0 \|\| feature.x >= src_width \|\| feature.y < 0 \|\| feature.y >= src_height) {
	break;
	}
	} while (++iter < max_iters && err > eps);

	// Make sure new point isn't too far from the initial point (indicates poor convergence)
	if (fabs(feature.x - init.x) > REFINE_WIN_HALF_W \|\| fabs(feature.y - init.y) > REFINE_WIN_HALF_H) {
	feature = init;
	}

	return feature;
	}

	// Performs non-maximum suppression on the harris response and writes the resulting
	// feature locations to refined_features.
	//
	// Assumes that refined_features and the global work sizes are set up such that the image
	// is split up into a grid of 32x32 blocks where each block has a single slot in the
	// refined_features buffer. This kernel finds the best corner in each block (if the
	// block has any) and writes it to the corresponding slot in the buffer.
	//
	// If subpixel_refine is true, the features are additionally refined at a sub-pixel
	// level for increased precision.
	__kernel void refine_features(
	__read_only image2d_t grayscale,
	__global const float *harris_buf,
	__global float2 *refined_features,
	int subpixel_refine
	) {
	int2 loc = (int2)(get_global_id(0), get_global_id(1));
	// The location in the grayscale buffer rather than the compacted grid
	int2 loc_i = (int2)(loc.x * 32, loc.y * 32);

	float new_val;
	float max_val = 0;
	float2 loc_max = (float2)(-1, -1);

	int end_x = min(loc_i.x + 32, (int)get_image_dim(grayscale).x - 1);
	int end_y = min(loc_i.y + 32, (int)get_image_dim(grayscale).y - 1);

	for (int i = loc_i.x; i < end_x; ++i) {
	for (int j = loc_i.y; j < end_y; ++j) {
	new_val = harris_buf[i + j * get_image_dim(grayscale).x];

	if (new_val > max_val) {
	max_val = new_val;
	loc_max = (float2)(i, j);
	}
	}
	}

	if (max_val == 0) {
	// There are no features in this part of the frame
	write_to_1d_arrf2(refined_features, loc, loc_max);
	return;
	}

	if (subpixel_refine) {
	float mask[REFINE_WIN_H * REFINE_WIN_W];
	for (int i = 0; i < REFINE_WIN_H; i++) {
	float y = (float)(i - REFINE_WIN_HALF_H) / REFINE_WIN_HALF_H;
	float vy = exp(-y * y);

	for (int j = 0; j < REFINE_WIN_W; j++) {
	float x = (float)(j - REFINE_WIN_HALF_W) / REFINE_WIN_HALF_W;
	mask[i * REFINE_WIN_W + j] = (float)(vy * exp(-x * x));
	}
	}

	loc_max = corner_sub_pix(grayscale, loc_max, mask);
	}

	write_to_1d_arrf2(refined_features, loc, loc_max);
	}

	// Extracts BRIEF descriptors from the grayscale src image for the given features
	// using the provided sampler.
	__kernel void brief_descriptors(
	__read_only image2d_t grayscale,
	__global const float2 *refined_features,
	// for 512 bit descriptors
	__global ulong8 *desc_buf,
	__global const PointPair *brief_pattern
	) {
	int2 loc = (int2)(get_global_id(0), get_global_id(1));
	float2 feature = read_from_1d_arrf2(refined_features, loc);

	// There was no feature in this part of the frame
	if (feature.x == -1) {
	write_to_1d_arrul8(desc_buf, loc, (ulong8)(0));
	return;
	}

	ulong8 desc = 0;
	ulong *p = &desc;

	for (int i = 0; i < 8; ++i) {
	for (int j = 0; j < 64; ++j) {
	PointPair pair = brief_pattern[j * (i + 1)];
	float l1 = read_imagef(grayscale, sampler_linear, feature + pair.p1).x;
	float l2 = read_imagef(grayscale, sampler_linear, feature + pair.p2).x;

	if (l1 < l2) {
	p[i] \|= 1UL << j;
	}
	}
	}

	write_to_1d_arrul8(desc_buf, loc, desc);
	}

	// Given buffers with descriptors for the current and previous frame, determines
	// which ones match, writing correspondences to matches_buf.
	//
	// Feature and descriptor buffers are assumed to be compacted (each element sourced
	// from a 32x32 block in the frame being processed).
	__kernel void match_descriptors(
	__global const float2 *prev_refined_features,
	__global const float2 *refined_features,
	__global const ulong8 *desc_buf,
	__global const ulong8 *prev_desc_buf,
	__global MotionVector *matches_buf
	) {
	int2 loc = (int2)(get_global_id(0), get_global_id(1));
	ulong8 desc = read_from_1d_arrul8(desc_buf, loc);
	const int search_radius = 3;

	MotionVector invalid_vector = (MotionVector) {
	(PointPair) {
	(float2)(-1, -1),
	(float2)(-1, -1)
	},
	0
	};

	if (desc.s0 == 0 && desc.s1 == 0) {
	// There was no feature in this part of the frame
	write_to_1d_arrvec(
	matches_buf,
	loc,
	invalid_vector
	);
	return;
	}

	int2 start = max(loc - search_radius, 0);
	int2 end = min(loc + search_radius, (int2)(get_global_size(0) - 1, get_global_size(1) - 1));

	for (int i = start.x; i < end.x; ++i) {
	for (int j = start.y; j < end.y; ++j) {
	int2 prev_point = (int2)(i, j);
	int total_dist = 0;

	ulong8 prev_desc = read_from_1d_arrul8(prev_desc_buf, prev_point);

	if (prev_desc.s0 == 0 && prev_desc.s1 == 0) {
	continue;
	}

	ulong *prev_desc_p = &prev_desc;
	ulong *desc_p = &desc;

	for (int i = 0; i < 8; i++) {
	total_dist += popcount(desc_p[i] ^ prev_desc_p[i]);
	}

	if (total_dist < DISTANCE_THRESHOLD) {
	write_to_1d_arrvec(
	matches_buf,
	loc,
	(MotionVector) {
	(PointPair) {
	read_from_1d_arrf2(prev_refined_features, prev_point),
	read_from_1d_arrf2(refined_features, loc)
	},
	1
	}
	);

	return;
	}
	}
	}

	// There is no found match for this point
	write_to_1d_arrvec(
	matches_buf,
	loc,
	invalid_vector
	);
	}

	// Returns the position of the given point after the transform is applied
	static float2 transformed_point(float2 p, __global const float *transform) {
	float2 ret;

	ret.x = p.x * transform[0] + p.y * transform[1] + transform[2];
	ret.y = p.x * transform[3] + p.y * transform[4] + transform[5];

	return ret;
	}


	// Performs the given transform on the src image
	__kernel void transform(
	__read_only image2d_t src,
	__write_only image2d_t dst,
	__global const float *transform
	) {
	int2 loc = (int2)(get_global_id(0), get_global_id(1));
	float2 norm = convert_float2(get_image_dim(src));

	write_imagef(
	dst,
	loc,
	read_imagef(
	src,
	sampler_linear_mirror,
	transformed_point((float2)(loc.x, loc.y), transform) / norm
	)
	);
	}

	// Returns the new location of the given point using the given crop bounding box
	// and the width and height of the original frame.
	static float2 cropped_point(
	float2 p,
	float2 top_left,
	float2 bottom_right,
	int2 orig_dim
	) {
	float2 ret;

	float crop_width = bottom_right.x - top_left.x;
	float crop_height = bottom_right.y - top_left.y;

	float width_norm = p.x / (float)orig_dim.x;
	float height_norm = p.y / (float)orig_dim.y;

	ret.x = (width_norm * crop_width) + top_left.x;
	ret.y = (height_norm * crop_height) + ((float)orig_dim.y - bottom_right.y);

	return ret;
	}

	// Upscales the given cropped region to the size of the original frame
	__kernel void crop_upscale(
	__read_only image2d_t src,
	__write_only image2d_t dst,
	float2 top_left,
	float2 bottom_right
	) {
	int2 loc = (int2)(get_global_id(0), get_global_id(1));

	write_imagef(
	dst,
	loc,
	read_imagef(
	src,
	sampler_linear,
	cropped_point((float2)(loc.x, loc.y), top_left, bottom_right, get_image_dim(dst))
	)
	);
	}

	// Draws boxes to represent the given point matches and uses the given transform
	// and crop info to make sure their positions are accurate on the transformed frame.
	//
	// model_matches is an array of three points that were used by the RANSAC process
	// to generate the given transform
	__kernel void draw_debug_info(
	__write_only image2d_t dst,
	__global const MotionVector *matches,
	__global const MotionVector *model_matches,
	int num_model_matches,
	__global const float *transform
	) {
	int loc = get_global_id(0);
	MotionVector vec = matches[loc];
	// Black box: matched point that RANSAC considered an outlier
	float4 big_rect_color = (float4)(0.1f, 0.1f, 0.1f, 1.0f);

	if (vec.should_consider) {
	// Green box: matched point that RANSAC considered an inlier
	big_rect_color = (float4)(0.0f, 1.0f, 0.0f, 1.0f);
	}

	for (int i = 0; i < num_model_matches; i++) {
	if (vec.p.p2.x == model_matches[i].p.p2.x && vec.p.p2.y == model_matches[i].p.p2.y) {
	// Orange box: point used to calculate model
	big_rect_color = (float4)(1.0f, 0.5f, 0.0f, 1.0f);
	}
	}

	float2 transformed_p1 = transformed_point(vec.p.p1, transform);
	float2 transformed_p2 = transformed_point(vec.p.p2, transform);

	draw_box(dst, (int2)(transformed_p2.x, transformed_p2.y), big_rect_color, 5);
	// Small light blue box: the point in the previous frame
	draw_box(dst, (int2)(transformed_p1.x, transformed_p1.y), (float4)(0.0f, 0.3f, 0.7f, 1.0f), 3);
	}