/*
* Copyright (c) 2012-2017 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef OPENVX_USE_NN
#include "test_tensor_util.h"
TESTCASE(TensorNN, CT_VXContext, ct_setup_vx_context, 0)
/****************************************************************************
* *
* Test vxConvolutionLayer *
* *
***************************************************************************/
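// Helper that picks a random input size and stride for a convolution/pooling
// test case and derives the matching output size from
//   output = (input + 2 * pad - (kernel + (kernel - 1) * dilation)) / stride + 1
// where the division is rounded up when use_ceil is set (CEILING) and down
// otherwise (FLOOR).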
static void ownGetConvPoolRandParams(
uint64_t * rng,
size_t pad_sz, size_t kernel_sz,
size_t dilation, // 0 in pooling
bool use_ceil,
/*OUT*/ size_t * input_sz,
/*OUT*/ size_t * stride,
/*OUT*/ size_t * output_sz)
{
const int min_input = MAX(kernel_sz + (kernel_sz - 1) * dilation - 2 * pad_sz, 1);
const int max_input = MIN(min_input, TEST_TENSOR_MAX_DIM_SZ) + 5;
*input_sz = (size_t)CT_RNG_NEXT_INT(*rng, min_input, max_input);
const size_t stride_candidate = (size_t)CT_RNG_NEXT_INT(*rng, 1, 4);
const size_t round_addition = use_ceil ? stride_candidate - 1 : 0;
const size_t numerator = *input_sz + 2 * pad_sz - (kernel_sz + (kernel_sz - 1) * dilation);
*output_sz = (numerator + round_addition) / stride_candidate + 1;
// There's an ambiguity in the stride determination; for example,
// 2 = (6 + 2 * 0 - 1) / stride + 1
// would be correct with a stride of 3, 4 or 5.
// We therefore pick the smallest stride satisfying the equation.
size_t t = stride_candidate;
while (t > 1 && (numerator + (use_ceil ? t - 2 : 0)) / (t - 1) + 1 == *output_sz) --t;
*stride = t;
}
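// Reference convolution used for result verification.
// Layouts (lowest dim first): input/output are [W][H][C]([B]), weights are
// [W][H][IFM][OFM] and the optional bias is either [OFM] (shared) or
// [W][H][OFM] (per-location).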
static void ownConvolution(
enum TestTensorDF fmt,
const void * input_ptr, tensor_desc_t input,
const void * weight_ptr, tensor_desc_t weight,
const void * bias_ptr, tensor_desc_t bias,
vx_size pad_x, vx_size pad_y,
vx_size stride_x, vx_size stride_y,
bool wrap, // true for WRAP, else SATURATE
bool to_ne, // true for ROUND_TO_NE, else ROUND_TO_ZERO
vx_size dilation_x, vx_size dilation_y,
void * output_ptr, tensor_desc_t output)
{
assert(fmt == TT_Q78 || fmt == TT_U8 || fmt == TT_S8);
assert(input.dim_num == 3 || input.dim_num == 4);
assert(weight.dim_num == 4);
assert(bias.dim_num == 0 || bias.dim_num == 1 || bias.dim_num == 3);
assert(output.dim_num == input.dim_num);
const size_t input_w = input.dims[0];
const size_t input_h = input.dims[1];
const size_t input_c = input.dims[2];
const size_t input_b = input.dim_num > 3 ? input.dims[3] : 1;
const size_t weight_w = weight.dims[0];
const size_t weight_h = weight.dims[1];
const size_t weight_ifm = weight.dims[2];
const size_t weight_ofm = weight.dims[3];
const bool bias_present = !!bias.dim_num;
const bool bias_shared = bias.dim_num == 1;
const size_t bias_w = bias.dim_num > 0 ? bias.dims[0] : 0;
const size_t bias_h = bias.dim_num > 1 ? bias.dims[1] : 1;
const size_t bias_ofm = bias.dim_num > 2 ? bias.dims[2] : 1;
const size_t output_w = output.dims[0];
const size_t output_h = output.dims[1];
const size_t output_c = output.dims[2];
const size_t output_b = output.dim_num > 3 ? output.dims[3] : 1;
assert(weight_w + (weight_w - 1) * dilation_x <= input_w + 2 * pad_x);
assert(weight_h + (weight_h - 1) * dilation_y <= input_h + 2 * pad_y);
assert(weight_ifm == input_c);
assert(weight_ofm == output_c);
if (bias_shared)
{
assert(bias_w == weight_ofm);
}
else if (bias_present)
{
assert(bias_w == output_w);
assert(bias_h == output_h);
assert(bias_ofm == output_c);
}
assert(output_b == input_b);
ownAssertStridesModSizeof(fmt, input);
ownAssertStridesModSizeof(fmt, weight);
ownAssertStridesModSizeof(fmt, bias);
ownAssertStridesModSizeof(fmt, output);
// Input and output pointers for the current batch being processed.
// Note: The compiler should be able to hoist this out, and there are
// several other hoisting opportunities here as well.
const char * in_b_ptr = input_ptr;
char * out_b_ptr = output_ptr;
for (size_t b = 0; b < output_b; ++b)
for (size_t ofm = 0; ofm < output_c; ++ofm)
for (size_t y = 0; y < output_h; ++y)
for (size_t x = 0; x < output_w; ++x)
{
int32_t sum = 0;
if (bias_present)
{
const size_t bias_byte_offset =
bias_shared
? (bias.strides[0] * ofm)
: (bias.strides[2] * ofm + bias.strides[1] * y + bias.strides[0] * x);
sum = ownLoadValueAsRawInt(fmt, (char *)bias_ptr + bias_byte_offset);
}
const size_t xx = x * stride_x;
const size_t yy = y * stride_y;
for (size_t ifm = 0; ifm < input_c; ++ifm)
{
for (size_t w_y = 0; w_y < weight_h; ++w_y)
for (size_t w_x = 0; w_x < weight_w; ++w_x)
{
const size_t tmp_x = xx + w_x * (dilation_x + 1) + dilation_x;
const size_t tmp_y = yy + w_y * (dilation_y + 1) + dilation_y;
if (tmp_x >= pad_x && tmp_x < input_w + pad_x &&
tmp_y >= pad_y && tmp_y < input_h + pad_y)
{
const size_t input_byte_offset =
(b ? input.strides[3] * b : 0) +
input.strides[2] * ifm +
input.strides[1] * (tmp_y - pad_y) +
input.strides[0] * (tmp_x - pad_x);
const size_t weight_byte_offset =
weight.strides[3] * ofm +
weight.strides[2] * ifm +
weight.strides[1] * w_y +
weight.strides[0] * w_x;
const int_fast32_t i_val = ownLoadValueAsRawInt(fmt, in_b_ptr + input_byte_offset);
const int_fast32_t w_val = ownLoadValueAsRawInt(fmt, (char *)weight_ptr + weight_byte_offset);
// This is ok since all of them fit into int32_t
sum = ownApplyWrapRoundingToAccum(fmt, i_val * w_val, wrap, to_ne) + sum;
}
}
sum = ownWrapOrSat(fmt, sum, wrap);
}
// The output offset steps could be accumulated in the loops instead of
// recalculating the offset here, but the compiler should be able to hoist the invariant parts out.
const size_t output_byte_offset =
(b ? output.strides[3] * b : 0) +
output.strides[2] * ofm +
output.strides[1] * y +
output.strides[0] * x;
ownStoreRawIntValue(fmt, sum, out_b_ptr + output_byte_offset);
}
}
enum TT_CONVOLUTION_BIAS_TYPE
{
BIAS_NONE,
BIAS_SHARED,
BIAS_PER_LOC,
};
typedef struct
{
const char * name;
enum TestTensorDF fmt;
vx_size weight_w;
vx_size weight_h;
vx_size padding_x;
vx_size padding_y;
enum vx_convert_policy_e convert_policy;
enum vx_round_policy_e rounding_policy;
enum vx_nn_rounding_type_e down_scale_size_rounding;
vx_size dilation_x;
vx_size dilation_y;
int batching_dim;
enum TT_CONVOLUTION_BIAS_TYPE bias_type;
} test_convolution_layer_arg;
#define TT_CONVOLUTION_CASES_BASE(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,D_X_,D_Y_,BATCH_,BIAS_) \
ARG(NAME_"_SZ_X"#SZ_X_"_Y"#SZ_Y_"_PAD_X"#PAD_X_"_Y"#PAD_Y_"_DILATION_X"#D_X_"_Y"#D_Y_, \
TT_##FMT_, SZ_X_, SZ_Y_, PAD_X_, PAD_Y_, VX_CONVERT_POLICY_##OF_, VX_ROUND_POLICY_TO_##ROUND_, \
VX_NN_DS_SIZE_ROUNDING_##DS_ROUND_, D_X_, D_Y_, BATCH_, BIAS_),
#define TT_CONVOLUTION_CASES_5(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,D_X_,D_Y_,BATCH_) \
TT_CONVOLUTION_CASES_BASE(NAME_"_NOBIAS",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,D_X_,D_Y_,BATCH_,BIAS_NONE) \
TT_CONVOLUTION_CASES_BASE(NAME_"_SHAREDBIAS",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,D_X_,D_Y_,BATCH_,BIAS_SHARED) \
TT_CONVOLUTION_CASES_BASE(NAME_"_PERLOCBIAS",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,D_X_,D_Y_,BATCH_,BIAS_PER_LOC)
#define TT_CONVOLUTION_CASES_4(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,D_X_,D_Y_) \
TT_CONVOLUTION_CASES_5(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,D_X_,D_Y_,0) \
TT_CONVOLUTION_CASES_5(NAME_"_BATCH",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,D_X_,D_Y_,1)
#define TT_CONVOLUTION_CASES_3(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_) \
TT_CONVOLUTION_CASES_4(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,0,0) \
TT_CONVOLUTION_CASES_4(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,0,1) \
TT_CONVOLUTION_CASES_4(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,DS_ROUND_,1,0)
#define TT_CONVOLUTION_CASES_2(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_) \
TT_CONVOLUTION_CASES_3(NAME_"_FLOOR",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,FLOOR) \
TT_CONVOLUTION_CASES_3(NAME_"_CEIL",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,CEILING)
#define TT_CONVOLUTION_CASES_1(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_) \
TT_CONVOLUTION_CASES_2(NAME_"_ZERO",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ZERO) \
TT_CONVOLUTION_CASES_2(NAME_"_NE",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,NEAREST_EVEN)
#define TT_CONVOLUTION_CASES_0(FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_) \
TT_CONVOLUTION_CASES_1(#FMT_"_WRAP",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,WRAP) \
TT_CONVOLUTION_CASES_1(#FMT_"_SAT",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,SATURATE)
#define TT_CONVOLUTION_CASES_EXTRA(FMT_) \
TT_CONVOLUTION_CASES_0(FMT_,3,4,1,2)
#define TT_CONVOLUTION_CASES_ALEXNET(FMT_) \
TT_CONVOLUTION_CASES_0(FMT_,11,11,0,0) \
TT_CONVOLUTION_CASES_0(FMT_,6,6,0,0) \
TT_CONVOLUTION_CASES_0(FMT_,5,5,0,0) \
TT_CONVOLUTION_CASES_0(FMT_,3,3,0,0)
#define TT_CONVOLUTION_CASES_ALL() \
TT_CONVOLUTION_CASES_ALEXNET(U8) \
TT_CONVOLUTION_CASES_EXTRA(U8)
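// The convolution test below (like the other layer tests in this file) runs
// TEST_TENSOR_NUM_ITERATIONS random configurations: it creates the tensors,
// fills them with random data, runs a single-node graph through
// vxConvolutionLayer and compares the output against ownConvolution().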
TEST_WITH_ARG(TensorNN, testConvolutionLayer, test_convolution_layer_arg,
TT_CONVOLUTION_CASES_ALL()
)
{
assert (arg_->fmt == TT_Q78 || arg_->fmt == TT_U8 || arg_->fmt == TT_S8);
assert (arg_->batching_dim >= 0);
assert (arg_->bias_type == BIAS_NONE || arg_->bias_type == BIAS_SHARED || arg_->bias_type == BIAS_PER_LOC);
assert (arg_->convert_policy == VX_CONVERT_POLICY_WRAP ||
arg_->convert_policy == VX_CONVERT_POLICY_SATURATE);
assert (arg_->rounding_policy == VX_ROUND_POLICY_TO_ZERO ||
arg_->rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN);
assert (arg_->down_scale_size_rounding == VX_NN_DS_SIZE_ROUNDING_FLOOR ||
arg_->down_scale_size_rounding == VX_NN_DS_SIZE_ROUNDING_CEILING);
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context_->vx_context_, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims >= (size_t)(3 + arg_->batching_dim));
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type;
vx_uint8 fixed_point_position;
vx_size sizeof_data_type;
ownUnpackFormat(arg_->fmt, &data_type, &fixed_point_position, &sizeof_data_type);
const size_t inout_dim_num = 3 + arg_->batching_dim;
const size_t weight_dim_num = 4;
const size_t bias_dim_num =
arg_->bias_type == BIAS_NONE ? 0 :
arg_->bias_type == BIAS_SHARED ? 1 : 3;
size_t in_dims[4];
size_t weight_dims[4];
size_t bias_dims[3];
size_t out_dims[4];
size_t in_strides[4];
size_t weight_strides[4];
size_t bias_strides[3];
size_t out_strides[4];
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("iter #: %d\n", iter);
fflush(stdout);
}
size_t input_w, stride_x, output_w;
ownGetConvPoolRandParams(
&rng,
arg_->padding_x, arg_->weight_w,
arg_->dilation_x,
arg_->down_scale_size_rounding == VX_NN_DS_SIZE_ROUNDING_CEILING,
&input_w, &stride_x, &output_w);
size_t input_h, stride_y, output_h;
ownGetConvPoolRandParams(
&rng,
arg_->padding_y, arg_->weight_h,
arg_->dilation_y,
arg_->down_scale_size_rounding == VX_NN_DS_SIZE_ROUNDING_CEILING,
&input_h, &stride_y, &output_h);
in_dims[0] = input_w;
in_dims[1] = input_h;
for (vx_size i = 2; i < inout_dim_num; ++i)
{
in_dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
}
out_dims[0] = output_w;
out_dims[1] = output_h;
out_dims[2] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
for (vx_size i = 3; i < inout_dim_num; ++i)
{
out_dims[i] = in_dims[i];
}
weight_dims[0] = arg_->weight_w;
weight_dims[1] = arg_->weight_h;
weight_dims[2] = in_dims[2];
weight_dims[3] = out_dims[2];
if (bias_dim_num == 1) { bias_dims[0] = out_dims[2]; }
else if (bias_dim_num == 3)
{
bias_dims[0] = out_dims[0];
bias_dims[1] = out_dims[1];
bias_dims[2] = out_dims[2];
}
vx_tensor in_tensor = vxCreateTensor(context_->vx_context_, inout_dim_num, in_dims, data_type, fixed_point_position);
vx_tensor weight_tensor = vxCreateTensor(context_->vx_context_, weight_dim_num, weight_dims, data_type, fixed_point_position);
vx_tensor bias_tensor = bias_dim_num ? vxCreateTensor(context_->vx_context_, bias_dim_num, bias_dims, data_type, fixed_point_position) : NULL;
vx_tensor out_tensor = vxCreateTensor(context_->vx_context_, inout_dim_num, out_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(in_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(weight_tensor, VX_TYPE_TENSOR);
if (bias_dim_num) { ASSERT_VX_OBJECT(bias_tensor, VX_TYPE_TENSOR); }
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
ownGetFlatByteStrides(arg_->fmt, in_dims, inout_dim_num, in_strides);
ownGetFlatByteStrides(arg_->fmt, weight_dims, weight_dim_num, weight_strides);
ownGetFlatByteStrides(arg_->fmt, bias_dims, bias_dim_num, bias_strides);
ownGetFlatByteStrides(arg_->fmt, out_dims, inout_dim_num, out_strides);
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t in_dims: { "); for (size_t i = 0; i < inout_dim_num; ++i) { printf("%zu, ", in_dims[i]); } printf(" }, \n");
printf("\t weight_dims: { "); for (size_t i = 0; i < weight_dim_num; ++i) { printf("%zu, ", weight_dims[i]); } printf(" }, \n");
if (bias_dim_num)
{
printf("\t bias_dims: { "); for (size_t i = 0; i < bias_dim_num; ++i) { printf("%zu, ", bias_dims[i]); } printf(" }, \n");
}
printf("\t out_dims: { "); for (size_t i = 0; i < inout_dim_num; ++i) { printf("%zu, ", out_dims[i]); } printf(" }, \n");
printf("\t }\n");
}
const size_t in_bytes = in_dims[inout_dim_num-1] * in_strides[inout_dim_num-1];
const size_t weight_bytes = weight_dims[weight_dim_num-1] * weight_strides[weight_dim_num-1];
const size_t bias_bytes = bias_dim_num ? bias_dims[bias_dim_num-1] * bias_strides[bias_dim_num-1] : 0;
const size_t out_bytes = out_dims[inout_dim_num-1] * out_strides[inout_dim_num-1];
const size_t in_count = in_bytes / sizeof_data_type;
const size_t weight_count = weight_bytes / sizeof_data_type;
const size_t bias_count = bias_bytes / sizeof_data_type;
void * const in = malloc(in_bytes);
void * const weight = malloc(weight_bytes);
void * const bias = bias_dim_num ? malloc(bias_bytes) : NULL;
void * const out = malloc(out_bytes);
void * const refs = malloc(out_bytes);
ASSERT(in && weight && (!bias_count || bias) && out && refs);
{
const int conv_prod_count = arg_->weight_w * arg_->weight_h * in_dims[2];
ownFillSmallRandData(arg_->fmt, &rng, in_count, conv_prod_count, in);
ownFillSmallRandData(arg_->fmt, &rng, weight_count, conv_prod_count, weight);
if (bias_dim_num) { ownFillRandData(arg_->fmt, &rng, bias_count, bias); }
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(in_tensor, inout_dim_num, view_start, in_dims, in_strides, in, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
VX_CALL(vxCopyTensorPatch(weight_tensor, weight_dim_num, view_start, weight_dims, weight_strides, weight, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
if (bias_dim_num)
{
VX_CALL(vxCopyTensorPatch(bias_tensor, bias_dim_num, view_start, bias_dims, bias_strides, bias, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
VX_CALL(vxCopyTensorPatch(out_tensor, inout_dim_num, view_start, out_dims, out_strides, out, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
{
vx_graph graph = vxCreateGraph(context_->vx_context_);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
const vx_nn_convolution_params_t params =
{
arg_->padding_x, arg_->padding_y, arg_->convert_policy, arg_->rounding_policy,
arg_->down_scale_size_rounding, arg_->dilation_x, arg_->dilation_y
};
vx_node node = vxConvolutionLayer(graph, in_tensor, weight_tensor, bias_tensor, &params, sizeof(params), out_tensor);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
tensor_desc_t in_td = { inout_dim_num, in_dims, in_strides };
tensor_desc_t weight_td = { weight_dim_num, weight_dims, weight_strides };
tensor_desc_t bias_td = { bias_dim_num, bias_dims, bias_strides };
tensor_desc_t out_td = { inout_dim_num, out_dims, out_strides };
ownConvolution(
arg_->fmt,
in, in_td,
weight, weight_td,
bias, bias_td,
arg_->padding_x, arg_->padding_y,
stride_x, stride_y,
arg_->convert_policy == VX_CONVERT_POLICY_WRAP,
arg_->rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN,
arg_->dilation_x, arg_->dilation_y,
refs, out_td);
const vx_size view_start[5] = { 0 };
VX_CALL(vxCopyTensorPatch(out_tensor, inout_dim_num, view_start, out_dims, out_strides, out, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
size_t first_diff_index;
size_t first_diff_byte_offset0;
size_t first_diff_byte_offset1;
if (!ownExpectIdenticalData(
arg_->fmt,
out, out_dims, inout_dim_num, out_strides,
refs, out_dims, inout_dim_num, out_strides,
8, //0, //(arg_->fmt == TT_Q78 ? 1 : 0),
&first_diff_index,
&first_diff_byte_offset0,
&first_diff_byte_offset1))
{
printf("DIFF! { idx: %zu, out: ", first_diff_index);
ownPrettyPrintVal(arg_->fmt, (char*)out + first_diff_byte_offset0);
printf(", ref: ");
ownPrettyPrintVal(arg_->fmt, (char*)refs + first_diff_byte_offset1);
printf(" }\n");
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT(0);
}
}
VX_CALL(vxReleaseTensor(&in_tensor));
VX_CALL(vxReleaseTensor(&weight_tensor));
if (bias_dim_num) { VX_CALL(vxReleaseTensor(&bias_tensor)); }
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, in_tensor);
EXPECT_EQ_PTR(NULL, weight_tensor);
EXPECT_EQ_PTR(NULL, bias_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
free(in);
free(weight);
free(bias);
free(out);
free(refs);
}
}
/****************************************************************************
* *
* Test vxFullyConnectedLayer *
* *
***************************************************************************/
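// Reference fully connected layer used for result verification.
// The input has either a single "core" dim or 3 core dims ([W][H][C]),
// optionally followed by up to 3 batch dims; the weights are either 2D
// ([inputs][outputs]) or 4D ([W][H][IFM][OFM]) and the optional bias is [outputs].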
static void ownFullyConnected(
enum TestTensorDF fmt,
const void * input_ptr, tensor_desc_t input,
const void * weight_ptr, tensor_desc_t weight,
const void * bias_ptr, tensor_desc_t bias,
bool wrap, // true for WRAP, else SATURATE
bool to_ne, // true for ROUND_TO_NE, else ROUND_TO_ZERO
void * output_ptr, tensor_desc_t output)
{
assert (fmt == TT_Q78 || fmt == TT_U8 || fmt == TT_S8);
const size_t batch_dim_num = output.dim_num - 1;
assert (batch_dim_num >= 0 && batch_dim_num <= 3);
const size_t core_dim_num = input.dim_num - batch_dim_num;
assert ((core_dim_num == 1 && weight.dim_num == 2) ||
(core_dim_num == 3 && (weight.dim_num == 2 || weight.dim_num == 4)));
assert (bias.dim_num == !!bias_ptr);
const bool bias_present = !!bias.dim_num;
if (core_dim_num == 1)
{
assert (weight.dims[0] == input.dims[0]);
}
else if (weight.dim_num == 2)
{
assert (weight.dims[0] == input.dims[0] * input.dims[1] * input.dims[2]);
}
else
{
assert (weight.dims[0] == input.dims[0]);
assert (weight.dims[1] == input.dims[1]);
assert (weight.dims[2] == input.dims[2]);
}
assert (weight.dims[weight.dim_num - 1] == output.dims[0]);
assert (!bias_present || bias.dims[0] == output.dims[0]);
for (size_t i = 0; i < batch_dim_num; ++i)
{
assert (output.dims[i + 1] == input.dims[i + core_dim_num]);
}
ownAssertStridesModSizeof(fmt, input);
ownAssertStridesModSizeof(fmt, weight);
ownAssertStridesModSizeof(fmt, bias);
ownAssertStridesModSizeof(fmt, output);
const size_t tmp_batch_dims[3] =
{
(batch_dim_num > 0 ? output.dims[1] : 1),
(batch_dim_num > 1 ? output.dims[2] : 1),
(batch_dim_num > 2 ? output.dims[3] : 1),
};
const size_t tmp_input_dims[3] =
{
(core_dim_num == 3 ? input.dims[0] : 1),
(core_dim_num == 3 ? input.dims[1] : 1),
input.dims[core_dim_num - 1],
};
const size_t ofm_num = output.dims[0];
for (size_t b2 = 0; b2 < tmp_batch_dims[2]; ++b2)
for (size_t b1 = 0; b1 < tmp_batch_dims[1]; ++b1)
for (size_t b0 = 0; b0 < tmp_batch_dims[0]; ++b0)
for (size_t ofm = 0; ofm < ofm_num; ++ofm)
{
int_fast32_t sum =
bias_present ? ownLoadValueAsRawInt(fmt, (char *)bias_ptr + bias.strides[0] * ofm) : 0;
for (size_t ifm = 0; ifm < tmp_input_dims[2]; ++ifm)
for (size_t y = 0; y < tmp_input_dims[1]; ++y)
for (size_t x = 0; x < tmp_input_dims[0]; ++x)
{
size_t weight_byte_offset = weight.strides[weight.dim_num-1] * ofm;
if (core_dim_num == 1)
{
weight_byte_offset += weight.strides[0] * ifm;
}
else if (weight.dim_num == 2)
{
const size_t count = x + tmp_input_dims[0] * (y + tmp_input_dims[1] * ifm);
weight_byte_offset += weight.strides[0] * count;
}
else
{
weight_byte_offset +=
weight.strides[2] * ifm +
weight.strides[1] * y +
weight.strides[0] * x;
}
const size_t input_byte_offset =
(batch_dim_num > 2 ? input.strides[core_dim_num + 2] * b2 : 0) +
(batch_dim_num > 1 ? input.strides[core_dim_num + 1] * b1 : 0) +
(batch_dim_num > 0 ? input.strides[core_dim_num + 0] * b0 : 0) +
input.strides[core_dim_num - 1] * ifm +
(core_dim_num == 3 ? input.strides[1] * y : 0) +
(core_dim_num == 3 ? input.strides[0] * x : 0);
const int_fast32_t w_val = ownLoadValueAsRawInt(fmt, (char *)weight_ptr + weight_byte_offset);
const int_fast32_t i_val = ownLoadValueAsRawInt(fmt, (char *)input_ptr + input_byte_offset);
// This is ok since all of them fit into int32_t
sum = ownApplyWrapRoundingToAccum(fmt, i_val * w_val, wrap, to_ne) + sum;
}
sum = ownWrapOrSat(fmt, sum, wrap);
const size_t output_byte_offset =
(batch_dim_num > 2 ? output.strides[3] * b2 : 0) +
(batch_dim_num > 1 ? output.strides[2] * b1 : 0) +
(batch_dim_num > 0 ? output.strides[1] * b0 : 0) +
output.strides[0] * ofm;
ownStoreRawIntValue(fmt, sum, (char *)output_ptr + output_byte_offset);
}
}
typedef struct
{
const char * name;
enum TestTensorDF fmt;
enum vx_convert_policy_e overflow_policy;
enum vx_round_policy_e rounding_policy;
vx_size core_dim;
vx_size weight_dim;
bool bias_present;
vx_size batch_dim;
} test_fully_connected_layer_arg;
#define TT_FULLYCONNECTED_CASES_BASE(NAME_,FMT_,OF_,ROUND_,CORE_DIMS_,W_DIMS_,BATCH_,BIAS_) \
ARG(NAME_"_COREDIMS_"#CORE_DIMS_"_WEIGHTDIMS_"#W_DIMS_"_BATCHDIMS_"#BATCH_, \
TT_##FMT_,VX_CONVERT_POLICY_##OF_, VX_ROUND_POLICY_TO_##ROUND_, \
CORE_DIMS_, W_DIMS_, BIAS_, BATCH_),
#define TT_FULLYCONNECTED_CASES_3(NAME_,FMT_,OF_,ROUND_,CORE_DIMS_,W_DIM_,BATCH_) \
TT_FULLYCONNECTED_CASES_BASE(NAME_"_NOBIAS",FMT_,OF_,ROUND_,CORE_DIMS_,W_DIM_,BATCH_,0) \
TT_FULLYCONNECTED_CASES_BASE(NAME_"_BIAS",FMT_,OF_,ROUND_,CORE_DIMS_,W_DIM_,BATCH_,1)
#define TT_FULLYCONNECTED_CASES_2(NAME_,FMT_,OF_,ROUND_) \
TT_FULLYCONNECTED_CASES_3(NAME_,FMT_,OF_,ROUND_,1,2,0) \
TT_FULLYCONNECTED_CASES_3(NAME_,FMT_,OF_,ROUND_,1,2,1) \
TT_FULLYCONNECTED_CASES_3(NAME_,FMT_,OF_,ROUND_,1,2,2) \
TT_FULLYCONNECTED_CASES_3(NAME_,FMT_,OF_,ROUND_,1,2,3) \
TT_FULLYCONNECTED_CASES_3(NAME_,FMT_,OF_,ROUND_,3,2,0) \
TT_FULLYCONNECTED_CASES_3(NAME_,FMT_,OF_,ROUND_,3,2,1) \
TT_FULLYCONNECTED_CASES_3(NAME_,FMT_,OF_,ROUND_,3,4,0) \
TT_FULLYCONNECTED_CASES_3(NAME_,FMT_,OF_,ROUND_,3,4,1)
#define TT_FULLYCONNECTED_CASES_1(NAME_,FMT_,OF_) \
TT_FULLYCONNECTED_CASES_2(NAME_"_ZERO",FMT_,OF_,ZERO) \
TT_FULLYCONNECTED_CASES_2(NAME_"_NE",FMT_,OF_,NEAREST_EVEN)
#define TT_FULLYCONNECTED_CASES_0(FMT_) \
TT_FULLYCONNECTED_CASES_1(#FMT_"_WRAP",FMT_,WRAP) \
TT_FULLYCONNECTED_CASES_1(#FMT_"_SAT",FMT_,SATURATE)
#define TT_FULLYCONNECTED_CASES_ALL() \
TT_FULLYCONNECTED_CASES_0(U8)
TEST_WITH_ARG(TensorNN, testFullyConnectedLayer, test_fully_connected_layer_arg,
TT_FULLYCONNECTED_CASES_ALL()
)
{
assert (arg_->fmt == TT_Q78 || arg_->fmt == TT_U8 || arg_->fmt == TT_S8);
assert (arg_->overflow_policy == VX_CONVERT_POLICY_WRAP ||
arg_->overflow_policy == VX_CONVERT_POLICY_SATURATE);
assert (arg_->rounding_policy == VX_ROUND_POLICY_TO_ZERO ||
arg_->rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN);
assert ((arg_->core_dim == 1 && arg_->weight_dim == 2) ||
(arg_->core_dim == 3 && (arg_->weight_dim == 2 || arg_->weight_dim == 4)));
assert (arg_->batch_dim >= 0 && arg_->core_dim + arg_->batch_dim <= 4);
{ // TODO: ownTestGetMaxDims() ?
vx_size max_dims = 0;
VX_CALL(vxQueryContext(context_->vx_context_, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims >= 4);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type;
vx_uint8 fixed_point_position;
vx_size sizeof_data_type;
ownUnpackFormat(arg_->fmt, &data_type, &fixed_point_position, &sizeof_data_type);
const size_t in_dim_num = arg_->core_dim + arg_->batch_dim;
const size_t weight_dim_num = arg_->weight_dim;
const size_t bias_dim_num = arg_->bias_present;
const size_t out_dim_num = 1 + arg_->batch_dim;
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("iter #: %d\n", iter);
fflush(stdout);
}
vx_size in_dims[4];
vx_size weight_dims[4];
vx_size bias_dims[1];
vx_size out_dims[4];
{
for (size_t i = 0; i < in_dim_num; ++i)
{
in_dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
}
out_dims[0] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
for (size_t i = 0; i < arg_->batch_dim; ++i)
{
out_dims[i + 1] = in_dims[i + arg_->core_dim];
}
weight_dims[weight_dim_num-1] = out_dims[0];
if (arg_->core_dim == 1)
{
weight_dims[0] = in_dims[0];
}
else if (arg_->weight_dim == 2)
{
weight_dims[0] = in_dims[0] * in_dims[1] * in_dims[2];
}
else
{
weight_dims[0] = in_dims[0];
weight_dims[1] = in_dims[1];
weight_dims[2] = in_dims[2];
}
if (bias_dim_num) bias_dims[0] = out_dims[0];
}
vx_size in_strides[4];
vx_size weight_strides[4];
vx_size bias_strides[1];
vx_size out_strides[4];
ownGetFlatByteStrides(arg_->fmt, in_dims, in_dim_num, in_strides);
ownGetFlatByteStrides(arg_->fmt, weight_dims, weight_dim_num, weight_strides);
ownGetFlatByteStrides(arg_->fmt, bias_dims, bias_dim_num, bias_strides);
ownGetFlatByteStrides(arg_->fmt, out_dims, out_dim_num, out_strides);
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t in_dims: { "); for (size_t i = 0; i < in_dim_num; ++i) { printf("%zu, ", in_dims[i]); } printf(" }, \n");
printf("\t weight_dims: { "); for (size_t i = 0; i < weight_dim_num; ++i) { printf("%zu, ", weight_dims[i]); } printf(" }, \n");
if (bias_dim_num)
{
printf("\t bias_dims: { "); for (size_t i = 0; i < bias_dim_num; ++i) { printf("%zu, ", bias_dims[i]); } printf(" }, \n");
}
printf("\t out_dims: { "); for (size_t i = 0; i < out_dim_num; ++i) { printf("%zu, ", out_dims[i]); } printf(" }, \n");
printf("\t }\n");
}
const size_t in_bytes = in_dims[in_dim_num-1] * in_strides[in_dim_num-1];
const size_t weight_bytes = weight_dims[weight_dim_num-1] * weight_strides[weight_dim_num-1];
const size_t bias_bytes = bias_dim_num ? bias_dims[bias_dim_num-1] * bias_strides[bias_dim_num-1] : 0;
const size_t out_bytes = out_dims[out_dim_num-1] * out_strides[out_dim_num-1];
const size_t in_count = in_bytes / sizeof_data_type;
const size_t weight_count = weight_bytes / sizeof_data_type;
const size_t bias_count = bias_bytes / sizeof_data_type;
void * const in = malloc(in_bytes);
void * const weight = malloc(weight_bytes);
void * const bias = bias_dim_num ? malloc(bias_bytes) : NULL;
void * const out = malloc(out_bytes);
void * const refs = malloc(out_bytes);
ASSERT(in && weight && (!bias_dim_num || bias) && out && refs);
vx_tensor in_tensor = vxCreateTensor(context_->vx_context_, in_dim_num, in_dims, data_type, fixed_point_position);
vx_tensor weight_tensor = vxCreateTensor(context_->vx_context_, weight_dim_num, weight_dims, data_type, fixed_point_position);
vx_tensor bias_tensor =
bias_dim_num
? vxCreateTensor(context_->vx_context_, bias_dim_num, bias_dims, data_type, fixed_point_position)
: NULL;
vx_tensor out_tensor = vxCreateTensor(context_->vx_context_, out_dim_num, out_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(in_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(weight_tensor, VX_TYPE_TENSOR);
if (bias_dim_num) { ASSERT_VX_OBJECT(bias_tensor, VX_TYPE_TENSOR); }
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
{
size_t fc_prod_count = 1;
for (size_t i = 0; i < weight_dim_num - 1; ++ i)
{
fc_prod_count *= weight_dims[i];
}
ownFillSmallRandData(arg_->fmt, &rng, in_count, fc_prod_count, in);
ownFillSmallRandData(arg_->fmt, &rng, weight_count, fc_prod_count, weight);
if (bias_dim_num) { ownFillRandData(arg_->fmt, &rng, bias_count, bias); }
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(in_tensor, in_dim_num, view_start, in_dims, in_strides, in, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
VX_CALL(vxCopyTensorPatch(weight_tensor, weight_dim_num, view_start, weight_dims, weight_strides, weight, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
if (bias_dim_num)
{
VX_CALL(vxCopyTensorPatch(bias_tensor, bias_dim_num, view_start, bias_dims, bias_strides, bias, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
VX_CALL(vxCopyTensorPatch(out_tensor, out_dim_num, view_start, out_dims, out_strides, out, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
{
vx_graph graph = vxCreateGraph(context_->vx_context_);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
vx_node node = vxFullyConnectedLayer(
graph,
in_tensor, weight_tensor, bias_tensor,
arg_->overflow_policy,
arg_->rounding_policy,
out_tensor);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
tensor_desc_t in_td = { in_dim_num, in_dims, in_strides };
tensor_desc_t weight_td = { weight_dim_num, weight_dims, weight_strides };
tensor_desc_t bias_td = { bias_dim_num, bias_dims, bias_strides };
tensor_desc_t out_td = { out_dim_num, out_dims, out_strides };
ownFullyConnected(
arg_->fmt,
in, in_td,
weight, weight_td,
bias, bias_td,
arg_->overflow_policy == VX_CONVERT_POLICY_WRAP,
arg_->rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN,
refs, out_td);
const vx_size view_start[4] = { 0 };
VX_CALL(vxCopyTensorPatch(out_tensor, out_dim_num, view_start, out_dims, out_strides, out, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
size_t first_diff_index;
size_t first_diff_byte_offset0;
size_t first_diff_byte_offset1;
if (!ownExpectIdenticalData(
arg_->fmt,
out, out_dims, out_dim_num, out_strides,
refs, out_dims, out_dim_num, out_strides,
8, //0, //(arg_->fmt == TT_Q78 ? 1 : 0),
&first_diff_index,
&first_diff_byte_offset0,
&first_diff_byte_offset1))
{
printf("DIFF! { idx: %zu, out: ", first_diff_index);
ownPrettyPrintVal(arg_->fmt, (char*)out + first_diff_byte_offset0);
printf(", ref: ");
ownPrettyPrintVal(arg_->fmt, (char*)refs + first_diff_byte_offset1);
printf(" }\n");
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT(0);
}
}
VX_CALL(vxReleaseTensor(&in_tensor));
VX_CALL(vxReleaseTensor(&weight_tensor));
if (bias_dim_num) { VX_CALL(vxReleaseTensor(&bias_tensor)); }
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, in_tensor);
EXPECT_EQ_PTR(NULL, weight_tensor);
EXPECT_EQ_PTR(NULL, bias_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
free(in);
free(weight);
free(bias);
free(out);
free(refs);
}
}
/****************************************************************************
* *
* Test vxPoolingLayer *
* *
***************************************************************************/
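// Reference pooling used for result verification. The window is clamped to
// the valid (non-padded) input region; MAX pooling takes the maximum over the
// clamped window while AVG pooling sums it and divides by the full window
// size (size_x * size_y), clamping the result to the format's range.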
static void ownPooling(
enum TestTensorDF fmt,
const void * input_ptr, tensor_desc_t input,
bool max_pooling, // MAX vs AVG pooling
size_t size_x, size_t size_y,
size_t pad_x, size_t pad_y,
size_t stride_x, size_t stride_y,
void * output_ptr, tensor_desc_t output)
{
assert(input.dim_num == 3 || input.dim_num == 4);
assert(output.dim_num == input.dim_num);
const size_t input_w = input.dims[0];
const size_t input_h = input.dims[1];
const size_t input_c = input.dims[2];
const size_t input_b = input.dim_num > 3 ? input.dims[3] : 1;
const size_t output_w = output.dims[0];
const size_t output_h = output.dims[1];
const size_t output_c = output.dims[2];
const size_t output_b = output.dim_num > 3 ? output.dims[3] : 1;
assert(input_w + 2 * pad_x >= size_x);
assert(input_h + 2 * pad_y >= size_y);
// assert(missing_div_with_round_mode((input_w + 2 * pad_x - size_x), stride_x) + 1 == output_w);
// assert(missing_div_with_round_mode((input_h + 2 * pad_y - size_y), stride_y) + 1 == output_h);
//TODO: verify this is enforced by the input/output validators
assert(output_c == input_c);
assert(output_b == input_b);
// Since we calculate offsets manually and cast to (int16_t *), we expect
// the alignment to be correct already.
ownAssertStridesModSizeof (fmt, input);
ownAssertStridesModSizeof (fmt, output);
//TODO: previously there was a 1d/3d stride for ofm but there's no 1D pool, right?
// Input and output pointers for the current batch being processed.
// Note: The compiler should be able to hoist this out, and there are
// several other hoisting opportunities here as well.
const char * in_b_ptr = input_ptr;
char * out_b_ptr = output_ptr;
for (size_t b = 0; b < output_b; ++b, in_b_ptr += input.strides[3], out_b_ptr += output.strides[3])
for (size_t c = 0; c < output_c; ++c)
for (size_t y = 0; y < output_h; ++y)
for (size_t x = 0; x < output_w; ++x)
{
int32_t result = max_pooling ? ownGetMinValue(fmt) : 0;
const size_t xx_start = CLAMP(x * stride_x, pad_x, input_w + pad_x) - pad_x;
const size_t xx_after = CLAMP(x * stride_x + size_x, pad_x, input_w + pad_x) - pad_x;
const size_t yy_start = CLAMP(y * stride_y, pad_y, input_h + pad_y) - pad_y;
const size_t yy_after = CLAMP(y * stride_y + size_y, pad_y, input_h + pad_y) - pad_y;
for (size_t yy = yy_start; yy < yy_after; ++yy)
for (size_t xx = xx_start; xx < xx_after; ++xx)
{
const size_t input_byte_offset =
input.strides[2] * c +
input.strides[1] * yy +
input.strides[0] * xx;
const int32_t i_val = ownLoadValueAsRawInt(fmt, in_b_ptr + input_byte_offset);
result = max_pooling? MAX(result, i_val) : (result + i_val);
}
if (!max_pooling)
{
//result = conversion_24_8(result / (int16_t)(size_x * size_y));
result = CLAMP(result / (int32_t)(size_x * size_y), ownGetMinValue(fmt), ownGetMaxValue(fmt));
}
const size_t output_byte_offset =
output.strides[2] * c +
output.strides[1] * y +
output.strides[0] * x;
ownStoreRawIntValue(fmt, result, out_b_ptr + output_byte_offset);
}
}
typedef struct
{
const char * name;
enum TestTensorDF fmt;
enum vx_nn_pooling_type_e pooling_type;
vx_size size_x;
vx_size size_y;
vx_size padding_x;
vx_size padding_y;
enum vx_nn_rounding_type_e down_scale_size_rounding;
bool batching_dim;
} test_pooling_layer_arg;
#define TT_POOLING_CASES_BASE(NAME_,FMT_,TYPE_,ROUND_,SX_,SY_,PX_,PY_,BATCH_) \
ARG(#FMT_"_SIZE_X"#SX_"_Y"#SY_"_PAD_X"#PX_"_Y"#PY_"_"#TYPE_""NAME_, \
TT_##FMT_, VX_NN_POOLING_##TYPE_, SX_, SY_, PX_, PY_, \
VX_NN_DS_SIZE_ROUNDING_##ROUND_, BATCH_),
#define TT_POOLING_CASES_2(FMT_,TYPE_,ROUND_,SX_,SY_,PX_,PY_) \
TT_POOLING_CASES_BASE("",FMT_,TYPE_,ROUND_,SX_,SY_,PX_,PY_,0) \
TT_POOLING_CASES_BASE("_BATCH",FMT_,TYPE_,ROUND_,SX_,SY_,PX_,PY_,1)
#define TT_POOLING_CASES_1(FMT_,TYPE_,ROUND_) \
TT_POOLING_CASES_2(FMT_,TYPE_,ROUND_,3,4,1,2)
#define TT_POOLING_CASES_0(FMT_,TYPE_) \
TT_POOLING_CASES_1(FMT_,TYPE_,FLOOR) \
TT_POOLING_CASES_1(FMT_,TYPE_,CEILING)
#define TT_POOLING_CASES_EXTRA(FMT_) \
TT_POOLING_CASES_0(FMT_,MAX) \
TT_POOLING_CASES_0(FMT_,AVG)
#define TT_POOLING_CASES_ALEXNET(FMT_) \
TT_POOLING_CASES_2(FMT_,MAX,FLOOR,3,3,0,0)
#define TT_POOLING_CASES_ALL() \
TT_POOLING_CASES_ALEXNET(U8) \
TT_POOLING_CASES_EXTRA(U8)
TEST_WITH_ARG(TensorNN, testPoolingLayer, test_pooling_layer_arg,
TT_POOLING_CASES_ALL()
)
{
assert (arg_->fmt == TT_Q78 || arg_->fmt == TT_U8 || arg_->fmt == TT_S8);
assert (arg_->pooling_type == VX_NN_POOLING_MAX ||
arg_->pooling_type == VX_NN_POOLING_AVG);
assert (arg_->down_scale_size_rounding == VX_NN_DS_SIZE_ROUNDING_FLOOR ||
arg_->down_scale_size_rounding == VX_NN_DS_SIZE_ROUNDING_CEILING);
assert (arg_->batching_dim == 0 || arg_->batching_dim == 1);
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context_->vx_context_, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims >= (size_t)(3 + arg_->batching_dim));
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type;
vx_uint8 fixed_point_position;
vx_size sizeof_data_type;
ownUnpackFormat(arg_->fmt, &data_type, &fixed_point_position, &sizeof_data_type);
const size_t dim_num = 3 + arg_->batching_dim;
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("iter #: %d\n", iter);
fflush(stdout);
}
size_t input_w, stride_x, output_w;
ownGetConvPoolRandParams(
&rng,
arg_->padding_x, arg_->size_x,
0 /* there's no dilation in pooling */,
arg_->down_scale_size_rounding == VX_NN_DS_SIZE_ROUNDING_CEILING,
&input_w, &stride_x, &output_w);
size_t input_h, stride_y, output_h;
ownGetConvPoolRandParams(
&rng,
arg_->padding_y, arg_->size_y,
0 /* there's no dilation in pooling */,
arg_->down_scale_size_rounding == VX_NN_DS_SIZE_ROUNDING_CEILING,
&input_h, &stride_y, &output_h);
const size_t chan = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
const size_t batch = arg_->batching_dim ? (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1) : 0;
const vx_size in_dims[4] = { input_w, input_h, chan, batch };
const vx_size out_dims[4] = { output_w, output_h, chan, batch };
vx_size in_strides[4];
vx_size out_strides[4];
ownGetFlatByteStrides(arg_->fmt, in_dims, dim_num, in_strides);
ownGetFlatByteStrides(arg_->fmt, out_dims, dim_num, out_strides);
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t in_dims: { "); for (size_t i = 0; i < dim_num; ++i) { printf("%zu, ", in_dims[i]); } printf(" }, \n");
printf("\t out_dims: { "); for (size_t i = 0; i < dim_num; ++i) { printf("%zu, ", out_dims[i]); } printf(" }, \n");
printf("\t }\n");
}
const size_t in_bytes = in_dims[dim_num-1] * in_strides[dim_num-1];
const size_t out_bytes = out_dims[dim_num-1] * out_strides[dim_num-1];
const size_t in_count = in_bytes / sizeof_data_type;
void * const in = malloc(in_bytes);
void * const out = malloc(out_bytes);
void * const refs = malloc(out_bytes);
ASSERT(in && out && refs);
vx_tensor in_tensor = vxCreateTensor(context_->vx_context_, dim_num, in_dims, data_type, fixed_point_position);
vx_tensor out_tensor = vxCreateTensor(context_->vx_context_, dim_num, out_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(in_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
{
// No real need for ownFillSmallRandData here because of the
// guaranteed 32-bit accumulator and our data counts being small.
ownFillRandData(arg_->fmt, &rng, in_count, in);
const vx_size view_start[4] = { 0 };
VX_CALL(vxCopyTensorPatch(in_tensor, dim_num, view_start, in_dims, in_strides, in, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
{
vx_graph graph = vxCreateGraph(context_->vx_context_);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
vx_node node = vxPoolingLayer(
graph, in_tensor, arg_->pooling_type,
arg_->size_x, arg_->size_y,
arg_->padding_x, arg_->padding_y,
arg_->down_scale_size_rounding,
out_tensor);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
tensor_desc_t in_td = { dim_num, in_dims, in_strides };
tensor_desc_t out_td = { dim_num, out_dims, out_strides };
ownPooling(
arg_->fmt,
in, in_td,
arg_->pooling_type == VX_NN_POOLING_MAX,
arg_->size_x, arg_->size_y,
arg_->padding_x, arg_->padding_y,
stride_x, stride_y,
refs, out_td);
const vx_size view_start[4] = { 0 };
VX_CALL(vxCopyTensorPatch(out_tensor, dim_num, view_start, out_dims, out_strides, out, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
size_t first_diff_index;
size_t first_diff_byte_offset0;
size_t first_diff_byte_offset1;
if (!ownExpectIdenticalData(
arg_->fmt,
out, out_dims, dim_num, out_strides,
refs, out_dims, dim_num, out_strides,
8, //0, //(arg_->fmt == TT_Q78 ? 1 : 0),
&first_diff_index,
&first_diff_byte_offset0,
&first_diff_byte_offset1))
{
printf("DIFF! { idx: %zu, out: ", first_diff_index);
ownPrettyPrintVal(arg_->fmt, (char*)out + first_diff_byte_offset0);
printf(", ref: ");
ownPrettyPrintVal(arg_->fmt, (char*)refs + first_diff_byte_offset1);
printf(" }\n");
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT(0);
}
}
VX_CALL(vxReleaseTensor(&in_tensor));
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, in_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
free(in);
free(out);
free(refs);
}
}
/****************************************************************************
* *
* Test vxSoftmaxLayer *
* *
***************************************************************************/
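// Reference softmax used for result verification. The summation ("key") dim
// is dim 2 (the channel dim) for tensors with more than 2 dims and dim 0
// otherwise; all remaining dims are treated as batch dims.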
static void ownSoftmax(
enum TestTensorDF fmt,
const void * input_ptr, tensor_desc_t input,
void * output_ptr, tensor_desc_t output)
{
//TODO: @Tomer, should we allow extra batch dims beyond 4? conv and pool have up to 3 of them! If not, we can just discard this define and its usage.
#define SOFTMAX_ALLOW_EXTRA_DIMS
#ifdef SOFTMAX_ALLOW_EXTRA_DIMS
assert(input.dim_num >= 1 && input.dim_num <= 4);
#else
assert(input.dim_num >= 1 && input.dim_num < MAX_NUM_OF_DIMENSIONS);
#endif
assert(input.dim_num == output.dim_num);
// Since we calculate offsets manually and cast to (int16_t *), we expect
// the alignment to be correct already.
ownAssertStridesModSizeof (fmt, input);
ownAssertStridesModSizeof (fmt, output);
// We precalculate and store the key (summation) dim and the remaining dims
// describing batching before the main loop for clarity, since the latter may be partially shifted depending on the key dim.
size_t key_sz = 0;
size_t key_in_stride = 0;
#ifdef SOFTMAX_ALLOW_EXTRA_DIMS
size_t batch_sz[5] = { 1, 1, 1, 1, 1 };
size_t batch_in_strides[5] = { 0 };
size_t batch_out_strides[5] = { 0 };
#else
size_t batch_sz[3] = { 1, 1, 1 };
size_t batch_in_strides[3] = { 0 };
size_t batch_out_strides[3] = { 0 };
#endif
#if 1
{
size_t key = input.dim_num > 2 ? 2 : 0;
key_sz = input.dims[key];
key_in_stride = input.strides[key];
for (size_t i = 0; i < input.dim_num - 1; ++i)
{
size_t idx = i < key ? i : i + 1;
batch_sz[i] = input.dims[idx];
batch_in_strides[i] = input.strides[idx];
batch_out_strides[i] = output.strides[idx];
}
}
#else
switch (input.dim_num)
{
#ifdef SOFTMAX_ALLOW_EXTRA_DIMS
case 6:
batch_sz[4] = input.dims[5];
batch_in_strides[4] = input.strides[5];
batch_out_strides[4] = output.strides[5];
/* fallthrough */
case 5:
batch_sz[3] = input.dims[4];
batch_in_strides[3] = input.strides[4];
batch_out_strides[3] = output.strides[4];
/* fallthrough */
#endif
case 4:
batch_sz[2] = input.dims[3];
batch_in_strides[2] = input.strides[3];
batch_out_strides[2] = output.strides[3];
/* fallthrough */
case 3:
key_sz = input.dims[2];
key_in_stride = input.strides[2];
batch_sz[1] = input.dims[1];
batch_in_strides[1] = input.strides[1];
batch_out_strides[1] = output.strides[1];
batch_sz[0] = input.dims[0];
batch_in_strides[0] = input.strides[0];
batch_out_strides[0] = output.strides[0];
break;
case 2:
batch_sz[0] = input.dims[1];
batch_in_strides[0] = input.strides[1];
batch_out_strides[0] = output.strides[1];
/* fallthrough */
case 1:
key_sz = input.dims[0];
key_in_stride = input.strides[0];
break;
default:
assert(0);
}
#endif
// The main loop calculation can be done with a double accumulator, a float
// accumulator with value normalization (exp(val - max_val)) to avoid reaching
// inf, or a plain float accumulator. All 3 are left here for result
// comparison, since the spec says nothing about the required accumulator width.
//
// Note: For U8 and S8 all 3 produce the same results. But for Q78, because
// exp(127) is quite large for a single-precision float, summing it may
// already overflow to inf within the summation, causing all values to
// become 0 after softmax! And obviously for F32 the chance of getting
// there is even higher.
//
// Set to 0 for float, 1 for double, 2 for float with norm.
#define SOFTMAX_ACCUM_TYPE 0
#ifdef SOFTMAX_ALLOW_EXTRA_DIMS
for (size_t b4 = 0; b4 < batch_sz[4]; ++b4)
for (size_t b3 = 0; b3 < batch_sz[3]; ++b3)
#endif
for (size_t b2 = 0; b2 < batch_sz[2]; ++b2)
for (size_t b1 = 0; b1 < batch_sz[1]; ++b1)
for (size_t b0 = 0; b0 < batch_sz[0]; ++b0)
{
// Input and output pointers for the current batch being processed.
const char * in_b_ptr = (char*)input_ptr +
batch_in_strides[2] * b2 +
batch_in_strides[1] * b1 +
batch_in_strides[0] * b0;
char * out_b_ptr = (char*)output_ptr +
batch_out_strides[2] * b2 +
batch_out_strides[1] * b1 +
batch_out_strides[0] * b0;
#ifdef SOFTMAX_ALLOW_EXTRA_DIMS
in_b_ptr += batch_in_strides[4] * b4 + batch_in_strides[3] * b3;
out_b_ptr += batch_out_strides[4] * b4 + batch_out_strides[3] * b3;
#endif
#if SOFTMAX_ACCUM_TYPE == 0
float sum = 0.f;
for (size_t i = 0; i < key_sz; ++i)
{
const int_fast32_t in = ownLoadValueAsRawInt(fmt, in_b_ptr + key_in_stride * i);
float in_val = ownUnquantize(fmt, in);
sum += expf(in_val);
}
for (size_t i = 0; i < key_sz; ++i)
{
const int_fast32_t in = ownLoadValueAsRawInt(fmt, in_b_ptr + key_in_stride * i);
float in_val = ownUnquantize(fmt, in);
ownStoreRawIntValue(fmt, ownQuantize(fmt, expf(in_val) / sum), out_b_ptr + key_in_stride * i);
}
#elif SOFTMAX_ACCUM_TYPE == 1
double sum = 0.;
for (size_t i = 0; i < key_sz; ++i)
{
const int16_t * in_ptr = (int16_t *)(in_b_ptr + key_in_stride * i);
float in_val = UNQUANTIZE(*in_ptr);
sum += exp(in_val);
}
for (size_t i = 0; i < key_sz; ++i)
{
const int16_t * in_ptr = (int16_t *)(in_b_ptr + key_in_stride * i);
float in_val = UNQUANTIZE(*in_ptr);
int16_t * out_ptr = (int16_t *)(out_b_ptr + key_in_stride * i);
*out_ptr = QUANTIZE(exp(in_val) / sum);
}
#elif SOFTMAX_ACCUM_TYPE == 2
float max_val = -FLT_MAX;
float sum = 0.f;
for (size_t i = 0; i < key_sz; ++i)
{
const int16_t * in_ptr = (int16_t *)(in_b_ptr + key_in_stride * i);
float in_val = UNQUANTIZE(*in_ptr);
max_val = MAX(max_val, in_val);
}
// Note: It may be beneficial to cache the exponents
for (size_t i = 0; i < key_sz; ++i)
{
const int16_t * in_ptr = (int16_t *)(in_b_ptr + key_in_stride * i);
float in_val = UNQUANTIZE(*in_ptr);
sum += expf(in_val - max_val);
}
for (size_t i = 0; i < key_sz; ++i)
{
const int16_t * in_ptr = (int16_t *)(in_b_ptr + key_in_stride * i);
float in_val = UNQUANTIZE(*in_ptr);
int16_t * out_ptr = (int16_t *)(out_b_ptr + key_in_stride * i);
*out_ptr = QUANTIZE(expf(in_val - max_val) / sum);
}
#else
#error SOFTMAX_ACCUM_TYPE must be 0..2
#endif
}
}
typedef struct
{
const char * name;
enum TestTensorDF fmt;
vx_size dim_num;
} test_softmax_layer_arg;
#define TT_SOFTMAX_CASES_BASE(FMT_,DIMS_) \
ARG(#FMT_"_DIMS"#DIMS_, TT_##FMT_, DIMS_),
#define TT_SOFTMAX_CASES_0(FMT_) \
TT_SOFTMAX_CASES_BASE(FMT_,1) \
TT_SOFTMAX_CASES_BASE(FMT_,2) \
TT_SOFTMAX_CASES_BASE(FMT_,3) \
TT_SOFTMAX_CASES_BASE(FMT_,4)
#define TT_SOFTMAX_CASES_ALL() \
TT_SOFTMAX_CASES_0(U8)
TEST_WITH_ARG(TensorNN, testSoftmaxLayer, test_softmax_layer_arg,
TT_SOFTMAX_CASES_ALL()
)
{
assert (arg_->fmt == TT_Q78 || arg_->fmt == TT_U8 || arg_->fmt == TT_S8);
assert (arg_->dim_num >= 1 && arg_->dim_num <=4);
{ // TODO: ownTestGetMaxDims() ?
vx_size max_dims = 0;
VX_CALL(vxQueryContext(context_->vx_context_, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims >= arg_->dim_num);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type;
vx_uint8 fixed_point_position;
vx_size sizeof_data_type;
ownUnpackFormat(arg_->fmt, &data_type, &fixed_point_position, &sizeof_data_type);
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("iter #: %d\n", iter);
fflush(stdout);
}
size_t dims[4];
for (size_t i = 0; i < arg_->dim_num; ++i)
{
dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
}
size_t strides[4];
ownGetFlatByteStrides(arg_->fmt, dims, arg_->dim_num, strides);
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: { dims: { ");
for (size_t i = 0; i < arg_->dim_num; ++i) { printf("%zu, ", dims[i]); }
printf(" } }\n");
}
const size_t bytes = dims[arg_->dim_num-1] * strides[arg_->dim_num-1];
const size_t count = bytes / sizeof_data_type;
void * const in = malloc(bytes);
void * const out = malloc(bytes);
void * const refs = malloc(bytes);
ASSERT(in && out && refs);
vx_tensor in_tensor = vxCreateTensor(context_->vx_context_, arg_->dim_num, dims, data_type, fixed_point_position);
vx_tensor out_tensor = vxCreateTensor(context_->vx_context_, arg_->dim_num, dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(in_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
{
// No real need for ownFillSmallRandData here because of the
// guaranteed 32-bit accumulator and our data counts being small.
ownFillRandData(arg_->fmt, &rng, count, in);
const vx_size view_start[4] = { 0 };
VX_CALL(vxCopyTensorPatch(in_tensor, arg_->dim_num, view_start, dims, strides, in, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
{
vx_graph graph = vxCreateGraph(context_->vx_context_);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
vx_node node = vxSoftmaxLayer(graph, in_tensor, out_tensor);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
tensor_desc_t td = { arg_->dim_num, dims, strides };
ownSoftmax(arg_->fmt, in, td, refs, td);
const vx_size view_start[4] = { 0 };
VX_CALL(vxCopyTensorPatch(out_tensor, arg_->dim_num, view_start, dims, strides, out, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
size_t first_diff_index;
size_t first_diff_byte_offset0;
size_t first_diff_byte_offset1;
if (!ownExpectIdenticalData(
arg_->fmt,
out, dims, arg_->dim_num, strides,
refs, dims, arg_->dim_num, strides,
8, //0, //(arg_->fmt == TT_Q78 ? 1 : 0),
&first_diff_index,
&first_diff_byte_offset0,
&first_diff_byte_offset1))
{
printf("DIFF! { idx: %zu, out: ", first_diff_index);
ownPrettyPrintVal(arg_->fmt, (char*)out + first_diff_byte_offset0);
printf(", ref: ");
ownPrettyPrintVal(arg_->fmt, (char*)refs + first_diff_byte_offset1);
printf(" }\n");
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT(0);
}
}
VX_CALL(vxReleaseTensor(&in_tensor));
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, in_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
free(in);
free(out);
free(refs);
}
}
/****************************************************************************
* *
* test vxNormalizationlayer *
* *
***************************************************************************/
//static void ownNormalization() { /*TODO*/ }
//
//typedef struct
//{
// const char * name;
//
// enum TestTensorDF fmt;
//} test_normalization_layer_arg;
//
//TEST_WITH_ARG(TensorNN, testNormalizationLayer, test_normalization_layer_arg,
// ARG("Q78", TT_Q78),
//)
//{
//}
/****************************************************************************
* *
* test vxActivationLayer *
* *
***************************************************************************/
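// Reference activation used for result verification. Note that the function
// is applied to the raw quantized integer value (see the conversion TODOs
// below) and the result is saturated back into the tensor format's range.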
static void ownActivation(
enum TestTensorDF fmt,
const void * input_ptr, tensor_desc_t input,
enum vx_nn_activation_function_e func,
float a, float b,
void * output_ptr, tensor_desc_t output)
{
assert (func == VX_NN_ACTIVATION_LOGISTIC ||
func == VX_NN_ACTIVATION_HYPERBOLIC_TAN ||
func == VX_NN_ACTIVATION_RELU ||
func == VX_NN_ACTIVATION_BRELU ||
func == VX_NN_ACTIVATION_SOFTRELU ||
func == VX_NN_ACTIVATION_ABS ||
func == VX_NN_ACTIVATION_SQUARE ||
func == VX_NN_ACTIVATION_SQRT ||
func == VX_NN_ACTIVATION_LINEAR);
assert (input.dim_num == output.dim_num);
assert (input.dim_num > 0 && input.dim_num <= 4);
for (size_t i = 0; i < input.dim_num; ++i)
{
assert (input.dims[i] == output.dims[i]);
}
ownAssertStridesModSizeof(fmt, input);
ownAssertStridesModSizeof(fmt, output);
const size_t dim0 = output.dims[0];
const size_t dim1 = output.dim_num > 1 ? output.dims[1]: 1;
const size_t dim2 = output.dim_num > 2 ? output.dims[2]: 1;
const size_t dim3 = output.dim_num > 3 ? output.dims[3]: 1;
for (size_t i3 = 0; i3 < dim3; ++i3)
for (size_t i2 = 0; i2 < dim2; ++i2)
for (size_t i1 = 0; i1 < dim1; ++i1)
for (size_t i0 = 0; i0 < dim0; ++i0)
{
const size_t input_byte_offset =
(input.dim_num > 3 ? input.strides[3] * i3 : 0) +
(input.dim_num > 2 ? input.strides[2] * i2 : 0) +
(input.dim_num > 1 ? input.strides[1] * i1 : 0) +
input.strides[0] * i0;
int_fast32_t val = ownLoadValueAsRawInt(fmt, (char*)input_ptr + input_byte_offset);
//TODO: should we check that val is a legal input for the function?
switch(func)
{
case VX_NN_ACTIVATION_LOGISTIC:
val = (int)(1 / (1 + exp(-val))); //TODO: conversion issue?
break;
case VX_NN_ACTIVATION_HYPERBOLIC_TAN:
val = (int)(a * tanh(b * val)); //TODO: conversion issue?
break;
case VX_NN_ACTIVATION_RELU:
val = MAX(0, val);
break;
case VX_NN_ACTIVATION_BRELU:
val = MIN(a, MAX(0, val)); //TODO: conversion issue?
break;
case VX_NN_ACTIVATION_SOFTRELU:
val = log(1 + exp(val)); //TODO: conversion issue?
break;
case VX_NN_ACTIVATION_ABS:
val = val < 0 ? - val : val;
break;
case VX_NN_ACTIVATION_SQUARE:
val = val * val;
break;
case VX_NN_ACTIVATION_SQRT:
val = sqrt(val); //TODO: conversion issue?
break;
case VX_NN_ACTIVATION_LINEAR:
val = a * val + b;
break;
default:
assert(0);
}
const size_t output_byte_offset =
(output.dim_num > 3 ? output.strides[3] * i3 : 0) +
(output.dim_num > 2 ? output.strides[2] * i2 : 0) +
(output.dim_num > 1 ? output.strides[1] * i1 : 0) +
output.strides[0] * i0;
val = ownWrapOrSat(fmt, val, false); //TODO: what should be done here??
ownStoreRawIntValue(fmt, val, (char*)output_ptr + output_byte_offset);
}
}
typedef struct
{
const char * name;
enum TestTensorDF fmt;
vx_size dim_num;
enum vx_nn_activation_function_e func;
vx_float32 a;
vx_float32 b;
} test_activation_layer_arg;
#define TT_ACTIVATION_CASES_BASE(NAME_,FMT_,DIMS_,FUNC_,A_,B_) \
ARG(#FMT_"_DIMS"#DIMS_"_"#FUNC_""NAME_, \
TT_##FMT_, DIMS_, VX_NN_ACTIVATION_##FUNC_, A_, B_),
//TODO: what do we want to test here???
#define TT_ACTIVATION_CASES_1(FMT_,DIM_) \
TT_ACTIVATION_CASES_BASE("",FMT_,DIM_,LOGISTIC,0,0) \
TT_ACTIVATION_CASES_BASE("_A1_B1",FMT_,DIM_,HYPERBOLIC_TAN,1,1) \
TT_ACTIVATION_CASES_BASE("_A2_B2",FMT_,DIM_,HYPERBOLIC_TAN,2,2) \
TT_ACTIVATION_CASES_BASE("",FMT_,DIM_,RELU,0,0)
#ifdef ACTIVATION_EXTRA
TT_ACTIVATION_CASES_BASE("_A50",FMT_,DIM_,BRELU,50,0) \
TT_ACTIVATION_CASES_BASE("",FMT_,DIM_,SOFTRELU,0,0) \
TT_ACTIVATION_CASES_BASE("",FMT_,DIM_,ABS,0,0) \
TT_ACTIVATION_CASES_BASE("",FMT_,DIM_,SQUARE,0,0) \
TT_ACTIVATION_CASES_BASE("",FMT_,DIM_,SQRT,0,0) \
TT_ACTIVATION_CASES_BASE("",FMT_,DIM_,LINEAR,1,0) \
TT_ACTIVATION_CASES_BASE("_Ahalf_B2",FMT_,DIM_,LINEAR,.5f,2)
#endif //ACTIVATION_EXTRA
#define TT_ACTIVATION_CASES_0(FMT_) \
TT_ACTIVATION_CASES_1(FMT_,1) \
TT_ACTIVATION_CASES_1(FMT_,2) \
TT_ACTIVATION_CASES_1(FMT_,3) \
TT_ACTIVATION_CASES_1(FMT_,4)
#define TT_ACTIVATION_CASES_ALL() \
TT_ACTIVATION_CASES_0(U8)
TEST_WITH_ARG(TensorNN, testActivationLayer, test_activation_layer_arg,
TT_ACTIVATION_CASES_ALL()
)
{
assert (arg_->fmt == TT_Q78 || arg_->fmt == TT_U8 || arg_->fmt == TT_S8);
assert (arg_->dim_num >= 1 && arg_->dim_num <= 4);
assert (arg_->func == VX_NN_ACTIVATION_LOGISTIC ||
arg_->func == VX_NN_ACTIVATION_HYPERBOLIC_TAN ||
arg_->func == VX_NN_ACTIVATION_RELU ||
arg_->func == VX_NN_ACTIVATION_BRELU ||
arg_->func == VX_NN_ACTIVATION_SOFTRELU ||
arg_->func == VX_NN_ACTIVATION_ABS ||
arg_->func == VX_NN_ACTIVATION_SQUARE ||
arg_->func == VX_NN_ACTIVATION_SQRT ||
arg_->func == VX_NN_ACTIVATION_LINEAR);
assert (arg_->a >= 0.f && arg_->b >= 0.f);
{ // TODO: ownTestGetMaxDims() ?
vx_size max_dims = 0;
VX_CALL(vxQueryContext(context_->vx_context_, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims >= arg_->dim_num);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type;
vx_uint8 fixed_point_position;
vx_size sizeof_data_type;
ownUnpackFormat(arg_->fmt, &data_type, &fixed_point_position, &sizeof_data_type);
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("iter #: %d\n", iter);
fflush(stdout);
}
size_t dims[4];
for (size_t i = 0; i < arg_->dim_num; ++i)
{
dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
}
size_t strides[4];
ownGetFlatByteStrides(arg_->fmt, dims, arg_->dim_num, strides);
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: { dims: { ");
for (size_t i = 0; i < arg_->dim_num; ++i) { printf("%zu, ", dims[i]); }
printf(" } }\n");
}
const size_t bytes = dims[arg_->dim_num-1] * strides[arg_->dim_num-1];
const size_t count = bytes / sizeof_data_type;
void * const in = malloc(bytes);
void * const out = malloc(bytes);
void * const refs = malloc(bytes);
ASSERT(in && out && refs);
vx_tensor in_tensor = vxCreateTensor(context_->vx_context_, arg_->dim_num, dims, data_type, fixed_point_position);
vx_tensor out_tensor = vxCreateTensor(context_->vx_context_, arg_->dim_num, dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(in_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
{
ownFillRandData(arg_->fmt, &rng, count, in);
const vx_size view_start[4] = { 0 };
VX_CALL(vxCopyTensorPatch(in_tensor, arg_->dim_num, view_start, dims, strides, in, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
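// Build and run a single-node graph containing the activation layer.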
{
vx_graph graph = vxCreateGraph(context_->vx_context_);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
vx_node node = vxActivationLayer(graph, in_tensor, arg_->func, arg_->a, arg_->b, out_tensor);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
tensor_desc_t td = { arg_->dim_num, dims, strides };
ownActivation(arg_->fmt, in, td, arg_->func, arg_->a, arg_->b, refs, td);
const vx_size view_start[4] = { 0 };
VX_CALL(vxCopyTensorPatch(out_tensor, arg_->dim_num, view_start, dims, strides, out, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
size_t first_diff_index;
size_t first_diff_byte_offset0;
size_t first_diff_byte_offset1;
if (!ownExpectIdenticalData(
arg_->fmt,
out, dims, arg_->dim_num, strides,
refs, dims, arg_->dim_num, strides,
8, //0, //(arg_->fmt == TT_Q78 ? 1 : 0),
&first_diff_index,
&first_diff_byte_offset0,
&first_diff_byte_offset1))
{
printf("DIFF! { idx: %zu, out: ", first_diff_index);
ownPrettyPrintVal(arg_->fmt, (char*)out + first_diff_byte_offset0);
printf(", ref: ");
ownPrettyPrintVal(arg_->fmt, (char*)refs + first_diff_byte_offset1);
printf(" }\n");
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT(0);
}
}
VX_CALL(vxReleaseTensor(&in_tensor));
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, in_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
free(in);
free(out);
free(refs);
}
}
/****************************************************************************
* *
* Test vxROIPoolingLayer *
* *
***************************************************************************/
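// Reference max ROI pooling: every ROI is split into an out_w x out_h grid of
// bins, and each output cell takes the maximum input value inside its bin
// (or 0 when the bin is empty).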
static void ownROIPooling(
enum TestTensorDF fmt,
const void * data, vx_size data_dim_num, const vx_size * data_dims, const vx_size * data_strides,
const void * rois, vx_size rois_dim_num, const vx_size * rois_dims, const vx_size * rois_strides,
void * out, vx_size out_dim_num, const vx_size * out_dims, const vx_size * out_strides)
{
assert ((data_dim_num == 3 && rois_dim_num == 2 && out_dim_num == 4) ||
(data_dim_num == 4 && rois_dim_num == 3 && out_dim_num == 5));
// format: [batch][channels][height][width]
const size_t data_w = data_dims[0];
const size_t data_h = data_dims[1];
const size_t data_c = data_dims[2];
const size_t data_b = data_dim_num == 4 ? data_dims[3] : 1;
// format: [batch][roi_count][4]
const size_t rois_d = rois_dims[0];
const size_t rois_r = rois_dims[1];
const size_t rois_b = rois_dim_num == 3 ? rois_dims[2] : 1;
// format: [batch][roi_count][channels][height][width]
const size_t out_w = out_dims[0];
const size_t out_h = out_dims[1];
const size_t out_c = out_dims[2];
const size_t out_r = out_dims[3];
const size_t out_b = out_dim_num == 5 ? out_dims[4] : 1;
assert(data_c == out_c);
assert(data_b == rois_b && data_b == out_b);
assert(rois_d == 4);
assert(rois_r == out_r);
{
size_t sizeof_data_type = 0;
switch(fmt)
{
case TT_Q78: sizeof_data_type = sizeof(vx_int16); break;
case TT_U8: sizeof_data_type = sizeof(vx_uint8); break;
case TT_S8: sizeof_data_type = sizeof(vx_int8); break;
default: assert(0);
}
for (size_t i = 0; i < data_dim_num; ++i) { assert(data_strides[i] % sizeof_data_type == 0); }
for (size_t i = 0; i < rois_dim_num; ++i) { assert(rois_strides[i] % sizeof_data_type == 0); }
for (size_t i = 0; i < out_dim_num; ++i) { assert(out_strides[i] % sizeof_data_type == 0); }
}
const int_fast32_t lowest_val = ownGetMinValue(fmt);
for (size_t b = 0; b < out_b; ++b)
for (size_t r = 0; r < out_r; ++r)
for (size_t c = 0; c < out_c; ++c)
for (size_t y = 0; y < out_h; ++y)
for (size_t x = 0; x < out_w; ++x)
{
const char * roi_b_ptr = (char*)rois + rois_strides[1] * r + (b ? rois_strides[2] * b : 0);
const int roi_x0 = ownLoadValueAsRawInt(fmt, roi_b_ptr + rois_strides[0] * 0);
const int roi_y0 = ownLoadValueAsRawInt(fmt, roi_b_ptr + rois_strides[0] * 1);
const int roi_x1 = ownLoadValueAsRawInt(fmt, roi_b_ptr + rois_strides[0] * 2);
const int roi_y1 = ownLoadValueAsRawInt(fmt, roi_b_ptr + rois_strides[0] * 3);
// The final coordinate is within the ROI => +1
// And we treat malformed dimensions as 1
const int roi_w = MAX(roi_x1 - roi_x0, 0) + 1;
const int roi_h = MAX(roi_y1 - roi_y0, 0) + 1;
// Note that "after" is rounded up else we get the last cell,
// instead of the cell beyond.
//
// For ex. with src being a 6 cell row and dst being a 4 cell one:
// >>> [((x + 0) * 6) // 4 for x in range(4)] # "begin" values
// [0, 1, 3, 4] # as expected
// >>> [((x + 1) * 6) // 4 for x in range(4)] # "after" values
// [1, 3, 4, 6] # [2, 3, 5, 6] expected!
const int dx_begin = ((x + 0) * roi_w) / out_w;
const int dy_begin = ((y + 0) * roi_h) / out_h;
const int dx_after = ((x + 1) * roi_w + (out_w - 1)) / out_w;
const int dy_after = ((y + 1) * roi_h + (out_h - 1)) / out_h;
// clamp in case roi_x or roi_y were unreasonable
const int x_begin = CLAMP((size_t)(roi_x0 + dx_begin), 0, data_w);
const int y_begin = CLAMP((size_t)(roi_y0 + dy_begin), 0, data_h);
const int x_after = CLAMP((size_t)(roi_x0 + dx_after), 0, data_w);
const int y_after = CLAMP((size_t)(roi_y0 + dy_after), 0, data_h);
const char * data_b_ptr = (char*)data + (b ? data_strides[3] * b : 0) + data_strides[2] * c;
// If there are no values for the current ROI bin, we default to 0
const bool non_empty = (x_begin < x_after && y_begin < y_after);
int res = non_empty ? lowest_val : 0;
for (int yy = y_begin; yy < y_after; ++yy)
for (int xx = x_begin; xx < x_after; ++xx)
{
const void * val_ptr = data_b_ptr + data_strides[1] * yy + data_strides[0] * xx;
int val = ownLoadValueAsRawInt(fmt, val_ptr);
res = MAX(res, val);
}
const size_t output_byte_offset =
(b ? out_strides[4] * b : 0) +
out_strides[3] * r + out_strides[2] * c +
out_strides[1] * y + out_strides[0] * x;
ownStoreRawIntValue(fmt, res, (char*)out + output_byte_offset);
}
}
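// Parameters for a single ROI pooling test case: tensor format and whether an
// explicit batch dimension is used.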
typedef struct
{
const char * name;
enum TestTensorDF fmt;
bool with_batching;
} test_roi_pooling_arg;
TEST_WITH_ARG(TensorNN, testROIPoolingLayer, test_roi_pooling_arg,
ARG("Q78", TT_Q78, false),
ARG("U8", TT_U8, false),
ARG("S8", TT_S8, false),
ARG("Q78_Bathcing", TT_Q78, true),
ARG("U8_Batching", TT_U8, true),
ARG("S8_Batching", TT_S8, true),
)
{
assert(arg_->fmt == TT_Q78 || arg_->fmt == TT_U8 || arg_->fmt == TT_S8);
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context_->vx_context_, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims >= (size_t)(arg_->with_batching ? 5 : 4));
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type;
vx_uint8 fixed_point_position;
vx_size sizeof_data_type;
ownUnpackFormat(arg_->fmt, &data_type, &fixed_point_position, &sizeof_data_type);
const size_t data_dim_num = arg_->with_batching ? 4 : 3;
const size_t rois_dim_num = arg_->with_batching ? 3 : 2;
const size_t out_dim_num = arg_->with_batching ? 5 : 4;
size_t * const data_dims = malloc(sizeof(*data_dims) * data_dim_num);
size_t * const rois_dims = malloc(sizeof(*rois_dims) * rois_dim_num);
size_t * const out_dims = malloc(sizeof(*out_dims) * out_dim_num);
ASSERT(data_dims && rois_dims && out_dims);
size_t * const data_strides = malloc(sizeof(*data_strides) * data_dim_num);
size_t * const rois_strides = malloc(sizeof(*rois_strides) * rois_dim_num);
size_t * const out_strides = malloc(sizeof(*out_strides) * out_dim_num);
ASSERT(data_strides && rois_strides && out_strides);
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("iter #: %d\n", iter);
fflush(stdout);
}
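// Randomize the input volume dimensions and compute packed byte strides
// (the innermost stride equals the element size).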
for (vx_size i = 0; i < data_dim_num; ++i)
{
data_dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
data_strides[i] = i ? data_strides[i-1] * data_dims[i-1] : sizeof_data_type;
}
rois_dims[0] = 4;
rois_dims[1] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
out_dims[0] = data_dims[0];
out_dims[1] = data_dims[1];
out_dims[2] = data_dims[2];
out_dims[3] = rois_dims[1];
if (arg_->with_batching)
{
out_dims[4] = rois_dims[2] = data_dims[3];
}
vx_tensor data_tensor = vxCreateTensor(context_->vx_context_, data_dim_num, data_dims, data_type, fixed_point_position);
vx_tensor rois_tensor = vxCreateTensor(context_->vx_context_, rois_dim_num, rois_dims, data_type, fixed_point_position);
vx_tensor out_tensor = vxCreateTensor(context_->vx_context_, out_dim_num, out_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(data_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(rois_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
for (size_t i = 0; i < rois_dim_num; ++i)
{
rois_strides[i] = i ? rois_strides[i-1] * rois_dims[i-1] : sizeof_data_type;
}
for (size_t i = 0; i < out_dim_num; ++i)
{
out_strides[i] = i ? out_strides[i-1] * out_dims[i-1] : sizeof_data_type;
}
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t data_dims: { "); for (size_t i = 0; i < data_dim_num; ++i) { printf("%zu, ", data_dims[i]); } printf(" }, \n");
printf("\t rois_dims: { "); for (size_t i = 0; i < rois_dim_num; ++i) { printf("%zu, ", rois_dims[i]); } printf(" }, \n");
printf("\t out_dims: { "); for (size_t i = 0; i < out_dim_num; ++i) { printf("%zu, ", out_dims[i]); } printf(" }, \n");
printf("\t }\n");
}
const size_t data_bytes = data_dims[data_dim_num-1] * data_strides[data_dim_num-1];
const size_t rois_bytes = rois_dims[rois_dim_num-1] * rois_strides[rois_dim_num-1];
const size_t out_bytes = out_dims[out_dim_num-1] * out_strides[out_dim_num-1];
const size_t data_count = data_bytes / sizeof_data_type;
void * const data = malloc(data_bytes);
void * const rois = malloc(rois_bytes);
void * const out = malloc(out_bytes);
void * const refs = malloc(out_bytes);
ASSERT(data && rois && out && refs);
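// Fill the input volume with random data and generate random ROI corner
// coordinates; the coordinates may fall slightly outside the input in order
// to exercise clamping.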
{
ownFillRandData(arg_->fmt, &rng, data_count, data);
for (size_t i = 0; i < rois_dims[1]; ++i)
{
switch(arg_->fmt)
{
case TT_Q78:
((vx_int16*)rois)[4*i+0] = (vx_int16)CT_RNG_NEXT_INT(rng, -2, data_dims[0] + 2);
((vx_int16*)rois)[4*i+1] = (vx_int16)CT_RNG_NEXT_INT(rng, -2, data_dims[1] + 2);
((vx_int16*)rois)[4*i+2] = (vx_int16)CT_RNG_NEXT_INT(rng, -2, data_dims[0] + 2);
((vx_int16*)rois)[4*i+3] = (vx_int16)CT_RNG_NEXT_INT(rng, -2, data_dims[1] + 2);
break;
case TT_U8:
((vx_uint8*)rois)[4*i+0] = (vx_uint8)CT_RNG_NEXT_INT(rng, 0, data_dims[0] + 2);
((vx_uint8*)rois)[4*i+1] = (vx_uint8)CT_RNG_NEXT_INT(rng, 0, data_dims[1] + 2);
((vx_uint8*)rois)[4*i+2] = (vx_uint8)CT_RNG_NEXT_INT(rng, 0, data_dims[0] + 2);
((vx_uint8*)rois)[4*i+3] = (vx_uint8)CT_RNG_NEXT_INT(rng, 0, data_dims[1] + 2);
break;
case TT_S8:
((vx_int8*)rois)[4*i+0] = (vx_int8)CT_RNG_NEXT_INT(rng, -2, data_dims[0] + 2);
((vx_int8*)rois)[4*i+1] = (vx_int8)CT_RNG_NEXT_INT(rng, -2, data_dims[1] + 2);
((vx_int8*)rois)[4*i+2] = (vx_int8)CT_RNG_NEXT_INT(rng, -2, data_dims[0] + 2);
((vx_int8*)rois)[4*i+3] = (vx_int8)CT_RNG_NEXT_INT(rng, -2, data_dims[1] + 2);
break;
default:
assert(0);
}
}
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(data_tensor, data_dim_num, view_start, data_dims, data_strides, data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
VX_CALL(vxCopyTensorPatch(rois_tensor, rois_dim_num, view_start, rois_dims, rois_strides, rois, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Create, run and dispose of the graph.
{
vx_graph graph = vxCreateGraph(context_->vx_context_);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
const vx_nn_roi_pool_params_t roi_pool_params = { VX_NN_POOLING_MAX };
vx_node node = vxROIPoolingLayer(graph, data_tensor, rois_tensor, &roi_pool_params, sizeof(roi_pool_params), out_tensor);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
ownROIPooling(
arg_->fmt,
data, data_dim_num, data_dims, data_strides,
rois, rois_dim_num, rois_dims, rois_strides,
refs, out_dim_num, out_dims, out_strides);
const size_t view_start[5] = { 0 };
VX_CALL(vxCopyTensorPatch(out_tensor, out_dim_num, view_start, out_dims, out_strides, out, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
size_t first_diff_index;
size_t first_diff_byte_offset0;
size_t first_diff_byte_offset1;
if (!ownExpectIdenticalData(
arg_->fmt,
out, out_dims, out_dim_num, out_strides,
refs, out_dims, out_dim_num, out_strides,
8, //0, //(arg_->fmt == TT_Q78 ? 1 : 0),
&first_diff_index,
&first_diff_byte_offset0,
&first_diff_byte_offset1))
{
printf("DIFF! { idx: %zu, out: ", first_diff_index);
ownPrettyPrintVal(arg_->fmt, (char*)out + first_diff_byte_offset0);
printf(", ref: ");
ownPrettyPrintVal(arg_->fmt, (char*)refs + first_diff_byte_offset1);
printf(" }\n");
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT(0);
}
}
VX_CALL(vxReleaseTensor(&data_tensor));
VX_CALL(vxReleaseTensor(&rois_tensor));
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, data_tensor);
EXPECT_EQ_PTR(NULL, rois_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
free(data);
free(rois);
free(out);
free(refs);
}
free(data_dims);
free(rois_dims);
free(out_dims);
free(data_strides);
free(rois_strides);
free(out_strides);
}
/****************************************************************************
* *
* test vxDeconvolutionLayer *
* *
***************************************************************************/
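// Pick a random upscale factor and a random input size, then derive the
// output size as (input - 1) * upscale + kernel + a - 2 * pad.
// Worked example (values chosen purely for illustration): input 4, upscale 2,
// kernel 3, a 0, pad 0 gives an output of (4 - 1) * 2 + 3 = 9.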
static void ownGetDeconvRandParams(
uint64_t * rng,
size_t pad_sz, size_t kernel_sz,
size_t a,
/*OUT*/ size_t * input_sz,
/*OUT*/ size_t * upscale,
/*OUT*/ size_t * output_sz)
{
*upscale = (size_t)CT_RNG_NEXT_INT(*rng, a + 1, a + 3);
const int tmp = (2 * (int)pad_sz - (int)kernel_sz - (int)a + ((int)*upscale - 1)) / (int)*upscale;
const int min_input = 2 + MAX(tmp, 0); //TODO: can we lower this?
const int max_input = MIN(min_input, TEST_TENSOR_MAX_DIM_SZ) + 5;
*input_sz = (size_t)CT_RNG_NEXT_INT(*rng, min_input, max_input);
*output_sz = (*input_sz - 1) * *upscale + kernel_sz + a - 2 * pad_sz;
}
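// Reference deconvolution: conceptually the input is upscaled by inserting
// zeros, padded with (kernel - 1 - pad) zeros on each border (plus a_x/a_y on
// the trailing side), and then correlated with the weights at stride 1.
// Bias may be absent, shared per output channel, or given per output location.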
static void ownDeconvolution(
enum TestTensorDF fmt,
const void * input_ptr, tensor_desc_t input,
const void * weight_ptr, tensor_desc_t weight,
const void * bias_ptr, tensor_desc_t bias,
vx_size pad_x, vx_size pad_y,
vx_size upscale_x, vx_size upscale_y,
bool wrap, // true for WRAP, else SATURATE
bool to_ne, // true for ROUND_TO_NE, else ROUND_TO_ZERO
vx_size a_x, vx_size a_y,
void * output_ptr, tensor_desc_t output)
{
assert(fmt == TT_Q78 || fmt == TT_U8 || fmt == TT_S8);
assert(input.dim_num == 3 || input.dim_num == 4);
assert(weight.dim_num == 4);
assert(bias.dim_num == 0 || bias.dim_num == 1 || bias.dim_num == 3);
assert(output.dim_num == input.dim_num);
const size_t input_w = input.dims[0];
const size_t input_h = input.dims[1];
const size_t input_c = input.dims[2];
const size_t input_b = input.dim_num > 3 ? input.dims[3] : 1;
const size_t weight_w = weight.dims[0];
const size_t weight_h = weight.dims[1];
const size_t weight_ifm = weight.dims[2];
const size_t weight_ofm = weight.dims[3];
const bool bias_present = !!bias.dim_num;
const bool bias_shared = bias.dim_num == 1;
const size_t bias_w = bias.dim_num > 0 ? bias.dims[0] : 0;
const size_t bias_h = bias.dim_num > 1 ? bias.dims[1] : 1;
const size_t bias_ofm = bias.dim_num > 2 ? bias.dims[2] : 1;
const size_t output_w = output.dims[0];
const size_t output_h = output.dims[1];
const size_t output_c = output.dims[2];
const size_t output_b = output.dim_num > 3 ? output.dims[3] : 1;
assert(weight_ifm == input_c);
assert(weight_ofm == output_c);
assert(upscale_x > 0 && upscale_y > 0);
assert(a_x < upscale_x && a_y < upscale_y);
assert((input_w - 1) * upscale_x + weight_w + a_x > 2 * pad_x);
assert((input_h - 1) * upscale_y + weight_h + a_y > 2 * pad_y);
assert(output_w == (input_w - 1) * upscale_x + weight_w + a_x - 2 * pad_x);
assert(output_h == (input_h - 1) * upscale_y + weight_h + a_y - 2 * pad_y);
assert(weight_w >= pad_x + 1);
assert(weight_h >= pad_y + 1);
const size_t start_x_pad = weight_w - pad_x - 1;
const size_t start_y_pad = weight_h - pad_y - 1;
// NOTE:
// The complete input width being sampled is,
// start_x_pad + ((input_w - 1) * upscale_x + 1) + after_x_pad
// which is
// (input_w - 1) * upscale_x + 2 * weight_w - 2 * pad_x - 1 + a_x
// and the stride being 1, the output width comes down to this plus 1 - weight_w
// which ends up being
// (input_w - 1) * upscale_x + weight_w - 2 * pad_x + a_x
// which is exactly output_w
if (bias_shared)
{
assert(bias_w == weight_ofm);
}
else if (bias_present)
{
assert(bias_w == output_w);
assert(bias_h == output_h);
assert(bias_ofm == output_c);
}
assert(output_b == input_b);
ownAssertStridesModSizeof(fmt, input);
ownAssertStridesModSizeof(fmt, weight);
ownAssertStridesModSizeof(fmt, bias);
ownAssertStridesModSizeof(fmt, output);
// Base input and output pointers; the per-batch offset is applied through
// the strides below. Note: the compiler should be able to hoist this out,
// and there are a number of other possible hoisting opportunities here.
const char * in_b_ptr = input_ptr;
char * out_b_ptr = output_ptr;
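// Walk every output element and accumulate input * weight over its receptive
// field in the (conceptually) zero-inserted, padded input.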
for (size_t b = 0; b < output_b; ++b)
for (size_t ofm = 0; ofm < output_c; ++ofm)
for (size_t y = 0; y < output_h; ++y)
for (size_t x = 0; x < output_w; ++x)
{
int32_t sum = 0;
if (bias_present)
{
const size_t bias_byte_offset =
bias_shared
? (bias.strides[0] * ofm)
: (bias.strides[2] * ofm + bias.strides[1] * y + bias.strides[0] * x);
sum = ownLoadValueAsRawInt(fmt, (char *)bias_ptr + bias_byte_offset);
}
for (size_t ifm = 0; ifm < input_c; ++ifm)
{
for (size_t w_y = 0; w_y < weight_h; ++w_y)
for (size_t w_x = 0; w_x < weight_w; ++w_x)
{
if (x + w_x >= start_x_pad && x + w_x < input_w + start_x_pad &&
y + w_y >= start_y_pad && y + w_y < input_h + start_y_pad)
{
const size_t xx = x + w_x - start_x_pad;
const size_t yy = y + w_y - start_y_pad;
if (xx % upscale_x == 0 && yy % upscale_y == 0)
{
const size_t input_byte_offset =
(b ? input.strides[3] * b : 0) +
input.strides[2] * ifm +
input.strides[1] * (yy / upscale_y) +
input.strides[0] * (xx / upscale_x);
const size_t weight_byte_offset =
weight.strides[3] * ofm +
weight.strides[2] * ifm +
weight.strides[1] * w_y +
weight.strides[0] * w_x;
const int_fast32_t i_val = ownLoadValueAsRawInt(fmt, in_b_ptr + input_byte_offset);
const int_fast32_t w_val = ownLoadValueAsRawInt(fmt, (char *)weight_ptr + weight_byte_offset);
// This is ok since all of them fit into int32_t
sum = ownApplyWrapRoundingToAccum(fmt, i_val * w_val, wrap, to_ne) + sum;
}
}
}
sum = ownWrapOrSat(fmt, sum, wrap);
}
// The offsets here could be updated incrementally in the loops instead of
// being recalculated for every element, but the compiler should be able to
// hoist the invariant parts out.
const size_t output_byte_offset =
(b ? output.strides[3] * b : 0) +
output.strides[2] * ofm +
output.strides[1] * y +
output.strides[0] * x;
ownStoreRawIntValue(fmt, sum, out_b_ptr + output_byte_offset);
}
}
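// Parameters for a single deconvolution layer test case; a_x/a_y correspond
// to the a_x/a_y members of the vx_nn_deconvolution_params_t passed below.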
typedef struct
{
const char * name;
enum TestTensorDF fmt;
vx_size weight_w;
vx_size weight_h;
vx_size padding_x;
vx_size padding_y;
enum vx_convert_policy_e convert_policy;
enum vx_round_policy_e rounding_policy;
vx_size a_x;
vx_size a_y;
int batching_dim;
enum TT_CONVOLUTION_BIAS_TYPE bias_type;
} test_deconvolution_layer_arg;
//TODO: take a more thorough look at these; taken from the convolution cases
#define TT_DECONVOLUTION_CASES_BASE(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,A_X_,A_Y_,BATCH_,BIAS_) \
ARG(NAME_"_SZ_X"#SZ_X_"_Y"#SZ_Y_"_PAD_X"#PAD_X_"_Y"#PAD_Y_"_A_X"#A_X_"_Y"#A_Y_, \
TT_##FMT_, SZ_X_, SZ_Y_, PAD_X_, PAD_Y_, VX_CONVERT_POLICY_##OF_, VX_ROUND_POLICY_TO_##ROUND_, \
A_X_, A_Y_, BATCH_, BIAS_),
#define TT_DECONVOLUTION_CASES_4(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,A_X_,A_Y_,BATCH_) \
TT_DECONVOLUTION_CASES_BASE(NAME_"_NOBIAS",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,A_X_,A_Y_,BATCH_,BIAS_NONE) \
TT_DECONVOLUTION_CASES_BASE(NAME_"_SHAREDBIAS",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,A_X_,A_Y_,BATCH_,BIAS_SHARED) \
TT_DECONVOLUTION_CASES_BASE(NAME_"_PERLOCBIAS",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,A_X_,A_Y_,BATCH_,BIAS_PER_LOC)
#define TT_DECONVOLUTION_CASES_3(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,A_X_,A_Y_) \
TT_DECONVOLUTION_CASES_4(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,A_X_,A_Y_,0) \
TT_DECONVOLUTION_CASES_4(NAME_"_BATCH",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,A_X_,A_Y_,1)
#define TT_DECONVOLUTION_CASES_2(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_) \
TT_DECONVOLUTION_CASES_3(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,0,0) \
TT_DECONVOLUTION_CASES_3(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,0,1) \
TT_DECONVOLUTION_CASES_3(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ROUND_,1,0)
#define TT_DECONVOLUTION_CASES_1(NAME_,FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_) \
TT_DECONVOLUTION_CASES_2(NAME_"_ZERO",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,ZERO) \
TT_DECONVOLUTION_CASES_2(NAME_"_NE",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,OF_,NEAREST_EVEN)
#define TT_DECONVOLUTION_CASES_0(FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_) \
TT_DECONVOLUTION_CASES_1(#FMT_"_WRAP",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,WRAP) \
TT_DECONVOLUTION_CASES_1(#FMT_"_SAT",FMT_,SZ_X_,SZ_Y_,PAD_X_,PAD_Y_,SATURATE)
#define TT_DECONVOLUTION_CASES_EXTRA(FMT_) \
TT_DECONVOLUTION_CASES_0(FMT_,11,11,0,0) \
TT_DECONVOLUTION_CASES_0(FMT_,6,6,0,0) \
TT_DECONVOLUTION_CASES_0(FMT_,5,5,0,0) \
TT_DECONVOLUTION_CASES_0(FMT_,3,3,0,0) \
TT_DECONVOLUTION_CASES_0(FMT_,3,4,1,2)
#define TT_DECONVOLUTION_CASES_ALL() \
TT_DECONVOLUTION_CASES_EXTRA(U8)
TEST_WITH_ARG(TensorNN, testDeconvolutionLayer, test_deconvolution_layer_arg,
TT_DECONVOLUTION_CASES_ALL()
)
{
assert (arg_->fmt == TT_Q78 || arg_->fmt == TT_U8 || arg_->fmt == TT_S8);
assert (arg_->batching_dim >= 0);
assert (arg_->bias_type == BIAS_NONE || arg_->bias_type == BIAS_SHARED || arg_->bias_type == BIAS_PER_LOC);
assert (arg_->convert_policy == VX_CONVERT_POLICY_WRAP ||
arg_->convert_policy == VX_CONVERT_POLICY_SATURATE);
assert (arg_->rounding_policy == VX_ROUND_POLICY_TO_ZERO ||
arg_->rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN);
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context_->vx_context_, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims >= (size_t)(3 + arg_->batching_dim));
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type;
vx_uint8 fixed_point_position;
vx_size sizeof_data_type;
ownUnpackFormat(arg_->fmt, &data_type, &fixed_point_position, &sizeof_data_type);
const size_t inout_dim_num = 3 + arg_->batching_dim;
const size_t weight_dim_num = 4;
const size_t bias_dim_num =
arg_->bias_type == BIAS_NONE ? 0 :
arg_->bias_type == BIAS_SHARED ? 1 : 3;
size_t in_dims[4];
size_t weight_dims[4];
size_t bias_dims[3];
size_t out_dims[4];
size_t in_strides[4];
size_t weight_strides[4];
size_t bias_strides[3];
size_t out_strides[4];
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("iter #: %d\n", iter);
fflush(stdout);
}
size_t input_w, upscale_x, output_w;
ownGetDeconvRandParams(
&rng,
arg_->padding_x, arg_->weight_w,
arg_->a_x,
&input_w, &upscale_x, &output_w);
size_t input_h, upscale_y, output_h;
ownGetDeconvRandParams(
&rng,
arg_->padding_y, arg_->weight_h,
arg_->a_y,
&input_h, &upscale_y, &output_h);
in_dims[0] = input_w;
in_dims[1] = input_h;
for (vx_size i = 2; i < inout_dim_num; ++i)
{
in_dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
}
out_dims[0] = output_w;
out_dims[1] = output_h;
out_dims[2] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
for (vx_size i = 3; i < inout_dim_num; ++i)
{
out_dims[i] = in_dims[i];
}
weight_dims[0] = arg_->weight_w;
weight_dims[1] = arg_->weight_h;
weight_dims[2] = in_dims[2];
weight_dims[3] = out_dims[2];
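// Bias shape: a single [ofm] vector when shared, a full [w][h][ofm] volume
// when given per output location.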
if (bias_dim_num == 1) { bias_dims[0] = out_dims[2]; }
else if (bias_dim_num == 3)
{
bias_dims[0] = out_dims[0];
bias_dims[1] = out_dims[1];
bias_dims[2] = out_dims[2];
}
vx_tensor in_tensor = vxCreateTensor(context_->vx_context_, inout_dim_num, in_dims, data_type, fixed_point_position);
vx_tensor weight_tensor = vxCreateTensor(context_->vx_context_, weight_dim_num, weight_dims, data_type, fixed_point_position);
vx_tensor bias_tensor = bias_dim_num ? vxCreateTensor(context_->vx_context_, bias_dim_num, bias_dims, data_type, fixed_point_position) : NULL;
vx_tensor out_tensor = vxCreateTensor(context_->vx_context_, inout_dim_num, out_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(in_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(weight_tensor, VX_TYPE_TENSOR);
if (bias_dim_num) { ASSERT_VX_OBJECT(bias_tensor, VX_TYPE_TENSOR); }
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
ownGetFlatByteStrides(arg_->fmt, in_dims, inout_dim_num, in_strides);
ownGetFlatByteStrides(arg_->fmt, weight_dims, weight_dim_num, weight_strides);
ownGetFlatByteStrides(arg_->fmt, bias_dims, bias_dim_num, bias_strides);
ownGetFlatByteStrides(arg_->fmt, out_dims, inout_dim_num, out_strides);
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t in_dims: { "); for (size_t i = 0; i < inout_dim_num; ++i) { printf("%zu, ", in_dims[i]); } printf(" }, \n");
printf("\t weight_dims: { "); for (size_t i = 0; i < weight_dim_num; ++i) { printf("%zu, ", weight_dims[i]); } printf(" }, \n");
if (bias_dim_num)
{
printf("\t bias_dims: { "); for (size_t i = 0; i < bias_dim_num; ++i) { printf("%zu, ", bias_dims[i]); } printf(" }, \n");
}
printf("\t out_dims: { "); for (size_t i = 0; i < inout_dim_num; ++i) { printf("%zu, ", out_dims[i]); } printf(" }, \n");
printf("\t }\n");
}
const size_t in_bytes = in_dims[inout_dim_num-1] * in_strides[inout_dim_num-1];
const size_t weight_bytes = weight_dims[weight_dim_num-1] * weight_strides[weight_dim_num-1];
const size_t bias_bytes = bias_dim_num ? bias_dims[bias_dim_num-1] * bias_strides[bias_dim_num-1] : 0;
const size_t out_bytes = out_dims[inout_dim_num-1] * out_strides[inout_dim_num-1];
const size_t in_count = in_bytes / sizeof_data_type;
const size_t weight_count = weight_bytes / sizeof_data_type;
const size_t bias_count = bias_bytes / sizeof_data_type;
void * const in = malloc(in_bytes);
void * const weight = malloc(weight_bytes);
void * const bias = bias_dim_num ? malloc(bias_bytes) : NULL;
void * const out = malloc(out_bytes);
void * const refs = malloc(out_bytes);
ASSERT(in && weight && (!bias_count || bias) && out && refs);
{
const int deconv_prod_count = arg_->weight_w * arg_->weight_h * in_dims[2];
ownFillSmallRandData(arg_->fmt, &rng, in_count, deconv_prod_count, in);
ownFillSmallRandData(arg_->fmt, &rng, weight_count, deconv_prod_count, weight);
if (bias_dim_num) { ownFillRandData(arg_->fmt, &rng, bias_count, bias); }
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(in_tensor, inout_dim_num, view_start, in_dims, in_strides, in, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
VX_CALL(vxCopyTensorPatch(weight_tensor, weight_dim_num, view_start, weight_dims, weight_strides, weight, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
if (bias_dim_num)
{
VX_CALL(vxCopyTensorPatch(bias_tensor, bias_dim_num, view_start, bias_dims, bias_strides, bias, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
VX_CALL(vxCopyTensorPatch(out_tensor, inout_dim_num, view_start, out_dims, out_strides, out, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
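// Build and run a single-node graph containing the deconvolution layer.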
{
vx_graph graph = vxCreateGraph(context_->vx_context_);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
const vx_nn_deconvolution_params_t params =
{
arg_->padding_x, arg_->padding_y, arg_->convert_policy, arg_->rounding_policy,
// arg_->down_scale_size_rounding,
arg_->a_x, arg_->a_y
};
vx_node node = vxDeconvolutionLayer(graph, in_tensor, weight_tensor, bias_tensor, &params, sizeof(params), out_tensor);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
tensor_desc_t in_td = { inout_dim_num, in_dims, in_strides };
tensor_desc_t weight_td = { weight_dim_num, weight_dims, weight_strides };
tensor_desc_t bias_td = { bias_dim_num, bias_dims, bias_strides };
tensor_desc_t out_td = { inout_dim_num, out_dims, out_strides };
ownDeconvolution(
arg_->fmt,
in, in_td,
weight, weight_td,
bias, bias_td,
arg_->padding_x, arg_->padding_y,
upscale_x, upscale_y,
arg_->convert_policy == VX_CONVERT_POLICY_WRAP,
arg_->rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN,
arg_->a_x, arg_->a_y,
refs, out_td);
const vx_size view_start[5] = { 0 };
VX_CALL(vxCopyTensorPatch(out_tensor, inout_dim_num, view_start, out_dims, out_strides, out, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
size_t first_diff_index;
size_t first_diff_byte_offset0;
size_t first_diff_byte_offset1;
if (!ownExpectIdenticalData(
arg_->fmt,
out, out_dims, inout_dim_num, out_strides,
refs, out_dims, inout_dim_num, out_strides,
8, //0, //(arg_->fmt == TT_Q78 ? 1 : 0),
&first_diff_index,
&first_diff_byte_offset0,
&first_diff_byte_offset1))
{
printf("DIFF! { idx: %zu, out: ", first_diff_index);
ownPrettyPrintVal(arg_->fmt, (char*)out + first_diff_byte_offset0);
printf(", ref: ");
ownPrettyPrintVal(arg_->fmt, (char*)refs + first_diff_byte_offset1);
printf(" }\n");
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT(0);
}
}
VX_CALL(vxReleaseTensor(&in_tensor));
VX_CALL(vxReleaseTensor(&weight_tensor));
if (bias_dim_num) { VX_CALL(vxReleaseTensor(&bias_tensor)); }
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, in_tensor);
EXPECT_EQ_PTR(NULL, weight_tensor);
EXPECT_EQ_PTR(NULL, bias_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
free(in);
free(weight);
free(bias);
free(out);
free(refs);
}
}
TESTCASE_TESTS(TensorNN,
/* vx_khr_nn.h function tests */
testConvolutionLayer,
testFullyConnectedLayer,
testPoolingLayer,
testSoftmaxLayer,
// testNormalizationLayer,
testActivationLayer,
testROIPoolingLayer,
testDeconvolutionLayer
)
#endif