/*
* Copyright (c) 2012-2017 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef OPENVX_USE_ENHANCED_VISION
#include <VX/vx.h>
#include <VX/vxu.h>
#include "test_tensor_util.h"
TESTCASE(TensorOp, CT_VXContext, ct_setup_vx_context, 0)
/****************************************************************************
* *
* Test vxTensorAddNode *
* Test vxTensorMultiplyNode *
* Test vxTensorSubtractNode *
* *
***************************************************************************/
enum TestTensorOp
{
TT_ADD,
TT_SUB,
TT_MUL,
};
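/* A minimal illustrative sketch (not used by the tests below): the Q78 format is a Q7.8
 * fixed-point representation, so a raw int16 value corresponds to raw / Q78_SCALE in real
 * units (Q78_SCALE comes from test_tensor_util.h; 256 is assumed here). The reference
 * results in ownCheckAddSubMulResult() operate directly on the raw representation. */
static inline vx_float32 own_example_q78_to_float(vx_int16 raw)
{
    return (vx_float32)raw / Q78_SCALE;
}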
static void ownCheckAddSubMulResult(
const void * in0_ptr, const vx_size * in0_dims, const vx_size * in0_strides,
const void * in1_ptr, const vx_size * in1_dims, const vx_size * in1_strides,
enum TestTensorDF fmt,
enum TestTensorOp op,
vx_size dim_num,
vx_size out_count,
bool wrap, // true for WRAP, else SATURATE
bool to_ne, // true for ROUND_TO_NE, else ROUND_TO_ZERO (only used for op == TT_MUL)
vx_float32 scale, // only used for op == TT_MUL
void * out_ptr, const vx_size * out_dims, const vx_size * out_strides)
{
double q78_scale = (double)scale / Q78_SCALE;
for (size_t index = 0; index < out_count; ++index)
{
const size_t in0_byte_offset = ownGetFlatByteOffsetWithBroadcast(index, dim_num, in0_dims, in0_strides, out_dims);
const size_t in1_byte_offset = ownGetFlatByteOffsetWithBroadcast(index, dim_num, in1_dims, in1_strides, out_dims);
const size_t out_byte_offset = ownGetFlatByteOffset(index, dim_num, out_dims, out_strides);
const char * in0_b_ptr = (char*)in0_ptr + in0_byte_offset;
const char * in1_b_ptr = (char*)in1_ptr + in1_byte_offset;
const char * out_b_ptr = (char*)out_ptr + out_byte_offset;
switch (fmt)
{
case TT_Q78:
{
const vx_int16 in0 = *(vx_int16*)in0_b_ptr;
const vx_int16 in1 = *(vx_int16*)in1_b_ptr;
const vx_int16 out = *(vx_int16*)out_b_ptr;
int16_t ref = 0;
switch (op)
{
case TT_ADD:
{
int32_t tmp = in0 + in1;
ref = wrap ? trunc_to_int16(tmp) : CLAMP(tmp, INT16_MIN, INT16_MAX);
}
break;
case TT_SUB:
{
int32_t tmp = in0 - in1;
ref = wrap ? trunc_to_int16(tmp) : CLAMP(tmp, INT16_MIN, INT16_MAX);
}
break;
case TT_MUL:
{
double tmp = in0 * in1 * q78_scale;
tmp = to_ne ? nearbyint(tmp) : trunc(tmp);
ref = wrap ? trunc_to_int16(tmp) : CLAMP(tmp, INT16_MIN, INT16_MAX);
}
break;
default: assert(0);
}
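// Worked example for the Q78 MUL reference above (assuming Q78_SCALE == 256):
// in0 = 0x0180 (1.5), in1 = 0x0200 (2.0), scale = 1.f => q78_scale = 1/256,
// tmp = 384 * 512 / 256 = 768 = 0x0300, i.e. 3.0 in Q7.8, which fits and needs
// neither rounding nor saturation.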
const int max_raw_int_diff = 1;
if (I64_ABS_DIFF(ref, out) > max_raw_int_diff) {
printf("DIFF!!! { idx: %zu, in0: %f, in0_raw: 0x%04X, in1: %f, in1_raw: 0x%04X, out: %f, out_raw: 0x%04X, ref: %f, ref_raw: 0x%04X\n }\n",
index, in0 / 256.f, in0, in1 / 256.f, in1, out / 256.f, out, ref / 256.f, ref);
if (max_raw_int_diff)
{
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
ASSERT_EQ_INT(I64_ABS_DIFF(ref, out) > max_raw_int_diff, 0);
else
EXPECT_EQ_INT(I64_ABS_DIFF(ref, out) > max_raw_int_diff, 0);
}
else
{
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
ASSERT_EQ_INT(ref, out);
else
EXPECT_EQ_INT(ref, out);
}
}
}
break;
case TT_U8:
{
const vx_uint8 in0 = *(vx_uint8*)in0_b_ptr;
const vx_uint8 in1 = *(vx_uint8*)in1_b_ptr;
const vx_uint8 out = *(vx_uint8*)out_b_ptr;
uint8_t ref = 0;
switch (op)
{
case TT_ADD:
{
int32_t tmp = in0 + in1;
ref = wrap ? tmp : CLAMP(tmp, 0, UINT8_MAX);
}
break;
case TT_SUB:
{
int32_t tmp = in0 - in1;
ref = wrap ? tmp : CLAMP(tmp, 0, UINT8_MAX);
}
break;
case TT_MUL:
{
double tmp = in0 * in1 * scale;
tmp = to_ne ? nearbyint(tmp) : trunc(tmp);
ref = wrap ? tmp : CLAMP(tmp, 0, UINT8_MAX);
}
break;
default: assert(0);
}
if (ref != out)
printf("DIFF!!! { idx: %zu, in0: %d, in1: %d, out: %d, ref: %d }\n", index, in0, in1, out, ref);
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT_EQ_INT(out, ref); else EXPECT_EQ_INT(out, ref);
}
break;
case TT_S8:
{
const vx_int8 in0 = *(vx_int8*)in0_b_ptr;
const vx_int8 in1 = *(vx_int8*)in1_b_ptr;
const vx_int8 out = *(vx_int8*)out_b_ptr;
int8_t ref = 0;
switch (op)
{
case TT_ADD:
{
int32_t tmp = in0 + in1;
ref = wrap ? trunc_to_int8(tmp) : CLAMP(tmp, INT8_MIN, INT8_MAX);
}
break;
case TT_SUB:
{
int32_t tmp = in0 - in1;
ref = wrap ? trunc_to_int8(tmp) : CLAMP(tmp, INT8_MIN, INT8_MAX);
}
break;
case TT_MUL:
{
double tmp = in0 * in1 * scale;
tmp = to_ne ? nearbyint(tmp) : trunc(tmp);
ref = wrap ? trunc_to_int8(tmp) : CLAMP(tmp, INT8_MIN, INT8_MAX);
}
break;
default: assert(0);
}
if (ref != out)
printf("DIFF!!! { idx: %zu, in0: %d, in1: %d, out: %d, ref: %d }\n", index, in0, in1, out, ref);
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT_EQ_INT(out, ref); else EXPECT_EQ_INT(out, ref);
}
break;
default: assert(0);
}
}
}
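/* A minimal illustrative sketch (not used by the tests) of the broadcast indexing the
 * checker above relies on: a flat element index into the output is decomposed into
 * per-dim coordinates, and every dim where the input size is 1 (a broadcast dim)
 * contributes a zero offset. The actual helper, ownGetFlatByteOffsetWithBroadcast()
 * from test_tensor_util.h, is assumed to behave like this but may differ in details. */
static inline size_t own_example_broadcast_byte_offset(
    size_t flat_index, vx_size dim_num,
    const vx_size * in_dims, const vx_size * in_strides, const vx_size * out_dims)
{
    size_t offset = 0;
    for (vx_size d = 0; d < dim_num; ++d)
    {
        const size_t coord = flat_index % out_dims[d];  // coordinate along dim d in the output
        flat_index /= out_dims[d];
        offset += (in_dims[d] == 1 ? 0 : coord) * in_strides[d];  // broadcast dims pin to 0
    }
    return offset;
}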
typedef struct
{
const char * name;
enum TestTensorDF fmt;
enum TestTensorOp op;
enum vx_convert_policy_e convert_policy;
enum vx_round_policy_e rounding_policy;
vx_float32 scale;
} test_tensor_elementwise_op_arg;
#define TT_ELEMENTWISE_OP_BASE(NAME_,FMT_,OP_,OF_,ROUND_,SCALE_) \
ARG(NAME_,TT_##FMT_,TT_##OP_,VX_CONVERT_POLICY_##OF_,VX_ROUND_POLICY_TO_##ROUND_,SCALE_),
#define TT_ELEMENTWISE_OP_MUL2(NAME_,FMT_,OF_,ROUND_) \
TT_ELEMENTWISE_OP_BASE(NAME_"_1f",FMT_,MUL,OF_,ROUND_,1.f) \
TT_ELEMENTWISE_OP_BASE(NAME_"_1_255f",FMT_,MUL,OF_,ROUND_,(1.f/255)) \
TT_ELEMENTWISE_OP_BASE(NAME_"_1_32768f",FMT_,MUL,OF_,ROUND_,(1.f/(1<<15)))
#define TT_ELEMENTWISE_OP_MUL(NAME_,FMT_,OF_) \
TT_ELEMENTWISE_OP_MUL2(NAME_"_ZERO",FMT_,OF_,ZERO) \
TT_ELEMENTWISE_OP_MUL2(NAME_"_NE",FMT_,OF_,NEAREST_EVEN)
#define TT_ELEMENTWISE_OP1(NAME_,FMT_,OF_) \
TT_ELEMENTWISE_OP_BASE(NAME_"_ADD",FMT_,ADD,OF_,ZERO, 1) \
TT_ELEMENTWISE_OP_BASE(NAME_"_SUB",FMT_,SUB,OF_,ZERO, 1) \
TT_ELEMENTWISE_OP_MUL(NAME_"_MUL",FMT_,OF_)
#define TT_ELEMENTWISE_OP0(FMT_) \
TT_ELEMENTWISE_OP1(#FMT_"_WRAP",FMT_,WRAP) \
TT_ELEMENTWISE_OP1(#FMT_"_SAT",FMT_,SATURATE)
#define TT_ELEMENTWISE_OP_ALL() \
TT_ELEMENTWISE_OP0(Q78) \
TT_ELEMENTWISE_OP0(U8) \
TT_ELEMENTWISE_OP0(S8)
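// For reference, each TT_ELEMENTWISE_OP0(FMT_) expansion above generates ARG entries such as:
//   ARG("Q78_WRAP_ADD",          TT_Q78, TT_ADD, VX_CONVERT_POLICY_WRAP, VX_ROUND_POLICY_TO_ZERO,         1)
//   ARG("Q78_WRAP_MUL_NE_1_255f", TT_Q78, TT_MUL, VX_CONVERT_POLICY_WRAP, VX_ROUND_POLICY_TO_NEAREST_EVEN, (1.f/255))
// i.e. ADD/SUB always use TO_ZERO rounding and a scale of 1, while MUL is crossed with both
// rounding policies and three scales.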
TEST_WITH_ARG(TensorOp, testvxTensorElementwiseOp, test_tensor_elementwise_op_arg,
TT_ELEMENTWISE_OP_ALL()
)
{
const vx_context context = context_->vx_context_;
const enum TestTensorDF fmt = arg_->fmt;
const enum TestTensorOp op = arg_->op;
const enum vx_convert_policy_e overflow_policy = arg_->convert_policy;
const enum vx_round_policy_e rounding_policy = arg_->rounding_policy;
const vx_float32 scale = arg_->scale;
assert(fmt == TT_Q78 || fmt == TT_U8 || fmt == TT_S8);
assert(op == TT_ADD || op == TT_SUB || op == TT_MUL);
assert(overflow_policy == VX_CONVERT_POLICY_WRAP || overflow_policy == VX_CONVERT_POLICY_SATURATE);
assert(rounding_policy == VX_ROUND_POLICY_TO_ZERO || rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN);
// Only MUL supports rounding_policy and scale; we chose not to allow anything but the default values for the other ops
assert(op == TT_MUL || (rounding_policy == VX_ROUND_POLICY_TO_ZERO && scale == 1.f));
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims > 3);
if(!DEBUG_TEST_TENSOR_BEYOND_FOUR_DIMS) max_dims = 4; else max_dims = MIN(max_dims, MAX_TENSOR_DIMS);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type = 0;
vx_uint8 fixed_point_position = 0;
vx_size sizeof_data_type = 0;
ownUnpackFormat(fmt, &data_type, &fixed_point_position, &sizeof_data_type);
size_t * const in0_dims = ct_alloc_mem(sizeof(*in0_dims) * max_dims);
size_t * const in1_dims = ct_alloc_mem(sizeof(*in1_dims) * max_dims);
size_t * const out_dims = ct_alloc_mem(sizeof(*out_dims) * max_dims);
ASSERT(in0_dims && in1_dims && out_dims);
size_t * const in0_strides = ct_alloc_mem(sizeof(*in0_strides) * max_dims);
size_t * const in1_strides = ct_alloc_mem(sizeof(*in1_strides) * max_dims);
size_t * const out_strides = ct_alloc_mem(sizeof(*out_strides) * max_dims);
ASSERT(in0_strides && in1_strides && out_strides);
// The test strategy is a simple one: for each of the 1..max_dims supported
// dimensionalities we run TEST_TENSOR_NUM_ITERATIONS random dim-size and
// broadcast configurations. This approach may have issues if the
// implementation supports a lot of dimensions, since their random sizes of
// up to TEST_TENSOR_MAX_DIM_SZ could result in a huge memory requirement.
// However, from previous experience we expect this to typically be 4-6 dims.
// The other issue is that we do not test huge dimensions.
// Further limitations include the lack of virtual/view inputs and outputs,
// as well as the lack of modified-stride testing, etc.
// Note that iter is the inner loop, so that if two implementations support
// D1 and D2 dims resp., the same (pseudo-random) values would be used when
// testing the common min(D1, D2) dimensions.
for (vx_size dims = 1; dims <= max_dims; ++dims)
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("dims #: %zu,\titer #: %d\n", dims, iter);
fflush(stdout);
}
// First step is to get some random dim sizes, calc the strides and create the tensors.
for (vx_size i = 0; i < dims; ++i)
{
const size_t new_dim = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
const int mask0 = !!CT_RNG_NEXT_INT(rng, 0, TEST_TENSOR_INVERSE_MASK_PROBABILITY);
const int mask1 = !!CT_RNG_NEXT_INT(rng, 0, TEST_TENSOR_INVERSE_MASK_PROBABILITY);
// Note: Broadcasting means that, for each dim, in0 and in1 either have the
// same size or a size of "1" for a broadcast value, and the output size is
// strictly determined by them. Thus the implementation is required to support
// { in0, in1, out } = { 1, 5, 5 } but not { in0, in1, out } = { 1, 1, 5 },
// even though the KHR sample implementation currently supports both.
// (See the worked example after this loop.)
in0_dims[i] = mask0 ? new_dim : 1;
in1_dims[i] = mask1 ? new_dim : 1;
out_dims[i] = mask0 || mask1 ? new_dim : 1;
in0_strides[i] = i ? in0_strides[i - 1] * in0_dims[i - 1] : sizeof_data_type;
in1_strides[i] = i ? in1_strides[i - 1] * in1_dims[i - 1] : sizeof_data_type;
out_strides[i] = i ? out_strides[i - 1] * out_dims[i - 1] : sizeof_data_type;
}
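// Worked example: with dims == 2, suppose dim 0 draws new_dim == 5 with (mask0, mask1) == (0, 1)
// and dim 1 draws new_dim == 3 with (1, 1). Then in0_dims == { 1, 3 }, in1_dims == { 5, 3 },
// out_dims == { 5, 3 }, and for Q78 (2-byte elements) in0_strides == { 2, 2 },
// in1_strides == { 2, 10 }, out_strides == { 2, 10 }; in0 is broadcast along dim 0.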
vx_tensor in0_tensor = vxCreateTensor(context, dims, in0_dims, data_type, fixed_point_position);
vx_tensor in1_tensor = vxCreateTensor(context, dims, in1_dims, data_type, fixed_point_position);
vx_tensor out_tensor = vxCreateTensor(context, dims, out_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(in0_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(in1_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
const size_t in0_bytes = in0_dims[dims - 1] * in0_strides[dims - 1];
const size_t in1_bytes = in1_dims[dims - 1] * in1_strides[dims - 1];
const size_t out_bytes = out_dims[dims - 1] * out_strides[dims - 1];
const size_t in0_count = in0_bytes / sizeof_data_type;
const size_t in1_count = in1_bytes / sizeof_data_type;
const size_t out_count = out_bytes / sizeof_data_type;
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t dim_num: %zu,\n", dims);
printf("\t in0 : { dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", in0_dims[i]); } printf(" }, count: %zu, bytes: %zu },\n", in0_count, in0_bytes);
printf("\t in1 : { dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", in1_dims[i]); } printf(" }, count: %zu, bytes: %zu },\n", in1_count, in1_bytes);
printf("\t out : { dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", out_dims[i]); } printf(" }, count: %zu, bytes: %zu },\n", out_count, out_bytes);
printf("\t }\n");
}
//TODO: This is pretty wasteful as it's repeating a lot of work per iteration:
// Both in the repeated malloc + free and inefficient data population
// which discards much of the random data, only using a part of it.
// Second step is to allocate the input and output data locations and populate the inputs.
void * const in0_data = ct_alloc_mem(in0_bytes);
void * const in1_data = ct_alloc_mem(in1_bytes);
void * const out_data = ct_alloc_mem(out_bytes);
ASSERT(in0_data && in1_data && out_data);
{
ownFillRandData(fmt, &rng, in0_count, in0_data);
ownFillRandData(fmt, &rng, in1_count, in1_data);
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(in0_tensor, dims, view_start, in0_dims, in0_strides, in0_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
VX_CALL(vxCopyTensorPatch(in1_tensor, dims, view_start, in1_dims, in1_strides, in1_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Third step is creating, running and disposing of the graph.
{
vx_graph graph = vxCreateGraph(context);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
vx_node node = NULL;
switch (op)
{
case TT_ADD:
node = vxTensorAddNode(graph, in0_tensor, in1_tensor, overflow_policy, out_tensor);
break;
case TT_SUB:
node = vxTensorSubtractNode(graph, in0_tensor, in1_tensor, overflow_policy, out_tensor);
break;
case TT_MUL:
{
vx_scalar scalar = vxCreateScalar(context, VX_TYPE_FLOAT32, &scale);
ASSERT_VX_OBJECT(scalar, VX_TYPE_SCALAR);
node = vxTensorMultiplyNode(graph, in0_tensor, in1_tensor, scalar, overflow_policy, rounding_policy, out_tensor);
VX_CALL(vxReleaseScalar(&scalar));
EXPECT_EQ_PTR(NULL, scalar);
break;
}
default:
ASSERT(0);
// Not implemented;
}
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
const size_t view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(out_tensor, dims, view_start, out_dims, out_strides, out_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
ownCheckAddSubMulResult(
in0_data, in0_dims, in0_strides,
in1_data, in1_dims, in1_strides,
fmt,
op,
dims,
out_count,
overflow_policy == VX_CONVERT_POLICY_WRAP,
rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN,
scale,
out_data, out_dims, out_strides);
}
VX_CALL(vxReleaseTensor(&in0_tensor));
VX_CALL(vxReleaseTensor(&in1_tensor));
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, in0_tensor);
EXPECT_EQ_PTR(NULL, in1_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
ct_free_mem(in0_data);
ct_free_mem(in1_data);
ct_free_mem(out_data);
}
ct_free_mem(in0_dims);
ct_free_mem(in1_dims);
ct_free_mem(out_dims);
ct_free_mem(in0_strides);
ct_free_mem(in1_strides);
ct_free_mem(out_strides);
}
TEST_WITH_ARG(TensorOp, testvxuTensorElementwiseOp, test_tensor_elementwise_op_arg,
TT_ELEMENTWISE_OP_ALL()
)
{
const vx_context context = context_->vx_context_;
const enum TestTensorDF fmt = arg_->fmt;
const enum TestTensorOp op = arg_->op;
const enum vx_convert_policy_e overflow_policy = arg_->convert_policy;
const enum vx_round_policy_e rounding_policy = arg_->rounding_policy;
const vx_float32 scale = arg_->scale;
assert(fmt == TT_Q78 || fmt == TT_U8 || fmt == TT_S8);
assert(op == TT_ADD || op == TT_SUB || op == TT_MUL);
assert(overflow_policy == VX_CONVERT_POLICY_WRAP || overflow_policy == VX_CONVERT_POLICY_SATURATE);
assert(rounding_policy == VX_ROUND_POLICY_TO_ZERO || rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN);
// Only MUL supports rounding_policy and scale; we chose not to allow anything but the default values for the other ops
assert(op == TT_MUL || (rounding_policy == VX_ROUND_POLICY_TO_ZERO && scale == 1.f));
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims > 3);
if(!DEBUG_TEST_TENSOR_BEYOND_FOUR_DIMS) max_dims = 4; else max_dims = MIN(max_dims, MAX_TENSOR_DIMS);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type = 0;
vx_uint8 fixed_point_position = 0;
vx_size sizeof_data_type = 0;
ownUnpackFormat(fmt, &data_type, &fixed_point_position, &sizeof_data_type);
size_t * const in0_dims = ct_alloc_mem(sizeof(*in0_dims) * max_dims);
size_t * const in1_dims = ct_alloc_mem(sizeof(*in1_dims) * max_dims);
size_t * const out_dims = ct_alloc_mem(sizeof(*out_dims) * max_dims);
ASSERT(in0_dims && in1_dims && out_dims);
size_t * const in0_strides = ct_alloc_mem(sizeof(*in0_strides) * max_dims);
size_t * const in1_strides = ct_alloc_mem(sizeof(*in1_strides) * max_dims);
size_t * const out_strides = ct_alloc_mem(sizeof(*out_strides) * max_dims);
ASSERT(in0_strides && in1_strides && out_strides);
// The test strategy is a simple one: for each of the 1..max_dims supported
// dimensionalities we run TEST_TENSOR_NUM_ITERATIONS random dim-size and
// broadcast configurations. This approach may have issues if the
// implementation supports a lot of dimensions, since their random sizes of
// up to TEST_TENSOR_MAX_DIM_SZ could result in a huge memory requirement.
// However, from previous experience we expect this to typically be 4-6 dims.
// The other issue is that we do not test huge dimensions.
// Further limitations include the lack of virtual/view inputs and outputs,
// as well as the lack of modified-stride testing, etc.
// Note that iter is the inner loop, so that if two implementations support
// D1 and D2 dims resp., the same (pseudo-random) values would be used when
// testing the common min(D1, D2) dimensions.
for (vx_size dims = 1; dims <= max_dims; ++dims)
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("dims #: %zu,\titer #: %d\n", dims, iter);
fflush(stdout);
}
// First step is to get some random dim sizes, calc the strides and create the tensors.
for (vx_size i = 0; i < dims; ++i)
{
const size_t new_dim = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
const int mask0 = !!CT_RNG_NEXT_INT(rng, 0, TEST_TENSOR_INVERSE_MASK_PROBABILITY);
const int mask1 = !!CT_RNG_NEXT_INT(rng, 0, TEST_TENSOR_INVERSE_MASK_PROBABILITY);
// Note: Broadcasting means that, for each dim, in0 and in1 either have the
// same size or a size of "1" for a broadcast value, and the output size is
// strictly determined by them. Thus the implementation is required to support
// { in0, in1, out } = { 1, 5, 5 } but not { in0, in1, out } = { 1, 1, 5 },
// even though the KHR sample implementation currently supports both.
in0_dims[i] = mask0 ? new_dim : 1;
in1_dims[i] = mask1 ? new_dim : 1;
out_dims[i] = mask0 || mask1 ? new_dim : 1;
in0_strides[i] = i ? in0_strides[i - 1] * in0_dims[i - 1] : sizeof_data_type;
in1_strides[i] = i ? in1_strides[i - 1] * in1_dims[i - 1] : sizeof_data_type;
out_strides[i] = i ? out_strides[i - 1] * out_dims[i - 1] : sizeof_data_type;
}
vx_tensor in0_tensor = vxCreateTensor(context, dims, in0_dims, data_type, fixed_point_position);
vx_tensor in1_tensor = vxCreateTensor(context, dims, in1_dims, data_type, fixed_point_position);
vx_tensor out_tensor = vxCreateTensor(context, dims, out_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(in0_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(in1_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
const size_t in0_bytes = in0_dims[dims - 1] * in0_strides[dims - 1];
const size_t in1_bytes = in1_dims[dims - 1] * in1_strides[dims - 1];
const size_t out_bytes = out_dims[dims - 1] * out_strides[dims - 1];
const size_t in0_count = in0_bytes / sizeof_data_type;
const size_t in1_count = in1_bytes / sizeof_data_type;
const size_t out_count = out_bytes / sizeof_data_type;
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t dim_num: %zu,\n", dims);
printf("\t in0 : { dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", in0_dims[i]); } printf(" }, count: %zu, bytes: %zu },\n", in0_count, in0_bytes);
printf("\t in1 : { dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", in1_dims[i]); } printf(" }, count: %zu, bytes: %zu },\n", in1_count, in1_bytes);
printf("\t out : { dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", out_dims[i]); } printf(" }, count: %zu, bytes: %zu },\n", out_count, out_bytes);
printf("\t }\n");
}
//TODO: This is pretty wasteful as it's repeating a lot of work per iteration:
// Both in the repeated malloc + free and inefficient data population
// which discards much of the random data, only using a part of it.
// Second step is to allocate the input and output data locations and populate the inputs.
void * const in0_data = ct_alloc_mem(in0_bytes);
void * const in1_data = ct_alloc_mem(in1_bytes);
void * const out_data = ct_alloc_mem(out_bytes);
ASSERT(in0_data && in1_data && out_data);
{
ownFillRandData(fmt, &rng, in0_count, in0_data);
ownFillRandData(fmt, &rng, in1_count, in1_data);
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(in0_tensor, dims, view_start, in0_dims, in0_strides, in0_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
VX_CALL(vxCopyTensorPatch(in1_tensor, dims, view_start, in1_dims, in1_strides, in1_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Third step is creating, running and disposing of the graph.
{
switch (op)
{
case TT_ADD:
VX_CALL(vxuTensorAdd(context, in0_tensor, in1_tensor, overflow_policy, out_tensor));
break;
case TT_SUB:
VX_CALL(vxuTensorSubtract(context, in0_tensor, in1_tensor, overflow_policy, out_tensor));
break;
case TT_MUL:
{
vx_scalar scalar = vxCreateScalar(context, VX_TYPE_FLOAT32, &scale);
ASSERT_VX_OBJECT(scalar, VX_TYPE_SCALAR);
VX_CALL(vxuTensorMultiply(context, in0_tensor, in1_tensor, scalar, overflow_policy, rounding_policy, out_tensor));
VX_CALL(vxReleaseScalar(&scalar));
EXPECT_EQ_PTR(NULL, scalar);
break;
}
default:
ASSERT(0);
// Not implemented;
}
}
// Verify the results
{
const size_t view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(out_tensor, dims, view_start, out_dims, out_strides, out_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
ownCheckAddSubMulResult(
in0_data, in0_dims, in0_strides,
in1_data, in1_dims, in1_strides,
fmt,
op,
dims,
out_count,
overflow_policy == VX_CONVERT_POLICY_WRAP,
rounding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN,
scale,
out_data, out_dims, out_strides);
}
VX_CALL(vxReleaseTensor(&in0_tensor));
VX_CALL(vxReleaseTensor(&in1_tensor));
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, in0_tensor);
EXPECT_EQ_PTR(NULL, in1_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
ct_free_mem(in0_data);
ct_free_mem(in1_data);
ct_free_mem(out_data);
}
ct_free_mem(in0_dims);
ct_free_mem(in1_dims);
ct_free_mem(out_dims);
ct_free_mem(in0_strides);
ct_free_mem(in1_strides);
ct_free_mem(out_strides);
}
/****************************************************************************
* *
* LUT Test *
* *
***************************************************************************/
static void ownFillRandDataForLUT(
enum TestTensorDF fmt,
uint64_t * rng,
size_t index_count,
size_t lut_count,
size_t lut_offset,
/*OUT*/ void * data)
{
switch(fmt)
{
case TT_Q78:
for(size_t i = 0; i < index_count; ++i)
((vx_int16*)data)[i] = (vx_int16)(CT_RNG_NEXT_INT(*rng, 0, lut_count) - lut_offset);
break;
case TT_U8:
for(size_t i = 0; i < index_count; ++i)
((vx_uint8*)data)[i] = (vx_uint8)(CT_RNG_NEXT_INT(*rng, 0, lut_count) - lut_offset);
break;
// case TT_S8:
// for(size_t i = 0; i < index_count; ++i)
// ((vx_int8*)data)[i] = (vx_int8)(CT_RNG_NEXT_INT(*rng, 0, lut_count) - lut_offset);
// break;
default:
assert(0);
}
}
static void ownUnpackFormatForLUT(
enum TestTensorDF fmt,
/*OUT*/ vx_size * lut_max_count,
/*OUT*/ vx_enum * lut_data_type)
{
switch(fmt)
{
case TT_Q78:
*lut_max_count = UINT16_MAX;
*lut_data_type = VX_TYPE_INT16;
break;
case TT_U8:
// case TT_S8:
*lut_max_count = UINT8_MAX;
*lut_data_type = VX_TYPE_UINT8;
break;
default:
assert(0);
}
}
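/* A minimal illustrative sketch (not used by the tests) of the indexing that the LUT checks
 * below assume: for the signed VX_TYPE_INT16 (Q78) case the table is addressed with an
 * offset of lut_count / 2, so an input value v reads lut[lut_offset + v]; for VX_TYPE_UINT8
 * the offset is 0. The caller is expected to keep lut_offset + v inside [0, lut_count). */
static inline vx_int16 own_example_lut_lookup_i16(const vx_int16 * lut, vx_uint32 lut_offset, vx_int16 v)
{
    return lut[(vx_int32)lut_offset + (vx_int32)v];
}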
//TODO: test_tensor_lut_op_arg and test_tensor_transpose_op_arg are basically identical - unify them?
typedef struct
{
const char * name;
enum TestTensorDF fmt;
} test_tensor_lut_op_arg;
TEST_WITH_ARG(TensorOp, testvxTensorTableLookup, test_tensor_lut_op_arg,
ARG("Q78_TABLELOOKUP", TT_Q78),
ARG("U8_TABLELOOKUP", TT_U8),
// ARG("S8_TABLELOOKUP", TT_S8),
)
{
const vx_context context = context_->vx_context_;
const enum TestTensorDF fmt = arg_->fmt;
assert(fmt == TT_Q78 || fmt == TT_U8);// || fmt == TT_S8);
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims > 3);
if(!DEBUG_TEST_TENSOR_BEYOND_FOUR_DIMS) max_dims = 4; else max_dims = MIN(max_dims, MAX_TENSOR_DIMS);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type = 0;
vx_uint8 fixed_point_position = 0;
vx_size sizeof_data_type = 0;
ownUnpackFormat(fmt, &data_type, &fixed_point_position, &sizeof_data_type);
vx_size lut_max_count = 0;
vx_enum lut_data_type = 0;
ownUnpackFormatForLUT(fmt, &lut_max_count, &lut_data_type);
size_t * const tensor_dims = ct_alloc_mem(sizeof(*tensor_dims) * max_dims);
size_t * const tensor_strides = ct_alloc_mem(sizeof(*tensor_strides) * max_dims);
ASSERT(tensor_dims && tensor_strides);
// The strategy is a simple one: For each of the 1..max_dims supported,
// we test a TEST_TENSOR_NUM_ITERATIONS of random tensor and LUT configs.
// While LUT should be (not verified) sufficiently tested by the non-NN
// tests, we prefer to use random LUT dims and data each iteration anyway,
// since the whole test shouldn't take long and this can be used as a
// standalone conformance part for our own tests.
//TODO: @Tomer, should we rather use a single (per fmt) randomly populated
// LUT, instead, anyway? Or is this acceptable?
// Note that iter is the inner loop, so that if two implementations support
// D1 and D2 dims resp. the same (pseudo-random) values would be used when
// testing the common min(D1, D2) dimensions.
for (vx_size dims = 1; dims <= max_dims; ++dims)
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("dims #: %zu,\titer #: %d\n", dims, iter);
fflush(stdout);
}
// First step is to get some random dim sizes, calc the strides and create the tensors.
for (vx_size i = 0; i < dims; ++i)
{
tensor_dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
tensor_strides[i] = i ? tensor_strides[i-1] * tensor_dims[i-1] : sizeof_data_type;
}
vx_tensor src_tensor = vxCreateTensor(context, dims, tensor_dims, data_type, fixed_point_position);
vx_tensor dst_tensor = vxCreateTensor(context, dims, tensor_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(src_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(dst_tensor, VX_TYPE_TENSOR);
const size_t tensor_bytes = tensor_dims[dims-1] * tensor_strides[dims-1];
const size_t tensor_count = tensor_bytes / sizeof_data_type;
const vx_size lut_count = (vx_size)CT_RNG_NEXT_INT(rng, 1, lut_max_count+1);
const vx_uint32 lut_offset = (lut_data_type == VX_TYPE_INT16) ? (vx_uint32)(lut_count / 2) : 0;
vx_lut lut = vxCreateLUT(context, lut_data_type, lut_count);
ASSERT_VX_OBJECT(lut, VX_TYPE_LUT);
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t tensor_dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", tensor_dims[i]); } printf(" }, \n");
printf("\t LUT_count: %zu,", lut_count);
printf("\t }\n");
}
void * const src_data = ct_alloc_mem(tensor_bytes);
void * const dst_data = ct_alloc_mem(tensor_bytes);
void * const lut_data = ct_alloc_mem(sizeof_data_type * lut_count);
ASSERT(src_data && dst_data && lut_data);
{ //TODO: ownTestInitTensors(..) ?
ownFillRandDataForLUT(fmt, &rng, tensor_count, lut_count, lut_offset, src_data);
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(src_tensor, dims, view_start, tensor_dims, tensor_strides, src_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
{
for (size_t i = 0; i < lut_count; ++i)
{
switch (fmt)
{
case TT_Q78:
((vx_int16*)lut_data)[i] = (vx_int16)(CT_RNG_NEXT_INT(rng, INT16_MIN, INT16_MAX + 1));
break;
case TT_U8:
// case TT_S8:
((vx_uint8*)lut_data)[i] = (vx_uint8)(CT_RNG_NEXT_INT(rng, 0, UINT8_MAX + 1));
break;
default: assert(0);
}
}
VX_CALL(vxCopyLUT(lut, lut_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Third step is creating, running and disposing of the graph.
{
vx_graph graph = vxCreateGraph(context);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
vx_node node = vxTensorTableLookupNode(graph, src_tensor, lut, dst_tensor);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
const size_t view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(dst_tensor, dims, view_start, tensor_dims, tensor_strides, dst_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
for (size_t index = 0; index < tensor_count; ++index)
{
const size_t tensor_byte_offset = ownGetFlatByteOffset(index, dims, tensor_dims, tensor_strides);
switch(fmt)
{
case TT_Q78:
{
const vx_int16 res = *(vx_int16*)((char*)dst_data + tensor_byte_offset);
const vx_int16 val = *(vx_int16*)((char*)src_data + tensor_byte_offset);
const int16_t ref = *((vx_int16*)lut_data + (size_t)((int32_t)lut_offset + (int32_t)val));
if (res != ref)
{
printf("DIFF!!!\t\t{ src[%zu] : %f (raw: %d), LUT[%d + %u]: %f (raw: %d), res[%zu]: %f (raw: %d) }\n",
tensor_byte_offset / sizeof(vx_int16), val / 256.f, val,
val, lut_offset, ref / 256.f, ref,
tensor_byte_offset / sizeof(vx_int16), res / 256.f, res);
}
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
{
ASSERT_EQ_INT(res, ref);
}
else
{
EXPECT_EQ_INT(res, ref);
}
}
break;
case TT_U8:
{
const vx_uint8 res = *(vx_uint8*)((char*)dst_data + tensor_byte_offset);
const vx_uint8 val = *(vx_uint8*)((char*)src_data + tensor_byte_offset);
const uint8_t ref = *((vx_uint8*)lut_data + (size_t)((int32_t)lut_offset + (int32_t)val));
if (res != ref)
{
printf("DIFF!!!\t\t{ src[%zu] : %d, LUT[%d + %u]: %d, res[%zu]: %d }\n",
tensor_byte_offset / sizeof(vx_uint8), val,
val, lut_offset, ref,
tensor_byte_offset / sizeof(vx_uint8), res);
}
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
{
ASSERT_EQ_INT(res, ref);
}
else
{
EXPECT_EQ_INT(res, ref);
}
}
break;
// case TT_S8:
// {
// const vx_int8 res = *(vx_int8*)(dst_data + tensor_byte_offset);
// const uint8_t val = *(uint8_t*)(src_data + tensor_byte_offset);
// const vx_int8 ref = *((vx_int8*)lut_data + (size_t)((int32_t)lut_offset + (int32_t)val));
//
// if (res != ref)
// {
// printf("DIFF!!!\t\t{ src[%zu] : %d, LUT[%d + %u]: %d, res[%zu]: %d }\n",
// tensor_byte_offset / sizeof(vx_int8), val,
// val, lut_offset, ref,
// tensor_byte_offset / sizeof(vx_int8), res);
// }
// if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
// {
// ASSERT_EQ_INT(res, ref);
// }
// else
// {
// EXPECT_EQ_INT(res, ref);
// }
// }
// break;
default: assert(0);
}
}
}
VX_CALL(vxReleaseTensor(&src_tensor));
VX_CALL(vxReleaseTensor(&dst_tensor));
VX_CALL(vxReleaseLUT(&lut));
EXPECT_EQ_PTR(NULL, src_tensor);
EXPECT_EQ_PTR(NULL, dst_tensor);
EXPECT_EQ_PTR(NULL, lut);
ct_free_mem(src_data);
ct_free_mem(dst_data);
}
ct_free_mem(tensor_dims);
ct_free_mem(tensor_strides);
}
TEST_WITH_ARG(TensorOp, testvxuTensorTableLookup, test_tensor_lut_op_arg,
ARG("Q78_TABLELOOKUP", TT_Q78),
ARG("U8_TABLELOOKUP", TT_U8)
)
{
const vx_context context = context_->vx_context_;
const enum TestTensorDF fmt = arg_->fmt;
assert(fmt == TT_Q78 || fmt == TT_U8);
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims > 3);
if(!DEBUG_TEST_TENSOR_BEYOND_FOUR_DIMS) max_dims = 4; else max_dims = MIN(max_dims, MAX_TENSOR_DIMS);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type = 0;
vx_uint8 fixed_point_position = 0;
vx_size sizeof_data_type = 0;
ownUnpackFormat(fmt, &data_type, &fixed_point_position, &sizeof_data_type);
vx_size lut_max_count = 0;
vx_enum lut_data_type = 0;
ownUnpackFormatForLUT(fmt, &lut_max_count, &lut_data_type);
size_t * const tensor_dims = ct_alloc_mem(sizeof(*tensor_dims) * max_dims);
size_t * const tensor_strides = ct_alloc_mem(sizeof(*tensor_strides) * max_dims);
ASSERT(tensor_dims && tensor_strides);
// The strategy is a simple one: For each of the 1..max_dims supported,
// we test a TEST_TENSOR_NUM_ITERATIONS of random tensor and LUT configs.
// While LUT should be (not verified) sufficiently tested by the non-NN
// tests, we prefer to use random LUT dims and data each iteration anyway,
// since the whole test shouldn't take long and this can be used as a
// standalone conformance part for our own tests.
//TODO: @Tomer, should we rather use a single (per fmt) randomly populated
// LUT, instead, anyway? Or is this acceptable?
// Note that iter is the inner loop, so that if two implementations support
// D1 and D2 dims resp. the same (pseudo-random) values would be used when
// testing the common min(D1, D2) dimensions.
for (vx_size dims = 1; dims <= max_dims; ++dims)
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("dims #: %zu,\titer #: %d\n", dims, iter);
fflush(stdout);
}
// First step is to get some random dim sizes, calc the strides and create the tensors.
for (vx_size i = 0; i < dims; ++i)
{
tensor_dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
tensor_strides[i] = i ? tensor_strides[i-1] * tensor_dims[i-1] : sizeof_data_type;
}
vx_tensor src_tensor = vxCreateTensor(context, dims, tensor_dims, data_type, fixed_point_position);
vx_tensor dst_tensor = vxCreateTensor(context, dims, tensor_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(src_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(dst_tensor, VX_TYPE_TENSOR);
const size_t tensor_bytes = tensor_dims[dims-1] * tensor_strides[dims-1];
const size_t tensor_count = tensor_bytes / sizeof_data_type;
const vx_size lut_count = (vx_size)CT_RNG_NEXT_INT(rng, 1, lut_max_count+1);
const vx_uint32 lut_offset = (lut_data_type == VX_TYPE_INT16) ? (vx_uint32)(lut_count / 2) : 0;
vx_lut lut = vxCreateLUT(context, lut_data_type, lut_count);
ASSERT_VX_OBJECT(lut, VX_TYPE_LUT);
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t tensor_dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", tensor_dims[i]); } printf(" }, \n");
printf("\t LUT_count: %zu,", lut_count);
printf("\t }\n");
}
void * const src_data = ct_alloc_mem(tensor_bytes);
void * const dst_data = ct_alloc_mem(tensor_bytes);
void * const lut_data = ct_alloc_mem(sizeof_data_type * lut_count);
ASSERT(src_data && dst_data && lut_data);
{ //TODO: ownTestInitTensors(..) ?
ownFillRandDataForLUT(fmt, &rng, tensor_count, lut_count, lut_offset, src_data);
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(src_tensor, dims, view_start, tensor_dims, tensor_strides, src_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
{
for (size_t i = 0; i < lut_count; ++i)
{
switch (fmt)
{
case TT_Q78:
((vx_int16*)lut_data)[i] = (vx_int16)(CT_RNG_NEXT_INT(rng, INT16_MIN, INT16_MAX + 1));
break;
case TT_U8:
((vx_uint8*)lut_data)[i] = (vx_uint8)(CT_RNG_NEXT_INT(rng, 0, UINT8_MAX + 1));
break;
default: assert(0);
}
}
VX_CALL(vxCopyLUT(lut, lut_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Third step is creating, running and disposing of the graph.
{
VX_CALL(vxuTensorTableLookup(context, src_tensor, lut, dst_tensor));
}
// Verify the results
{
const size_t view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(dst_tensor, dims, view_start, tensor_dims, tensor_strides, dst_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
for (size_t index = 0; index < tensor_count; ++index)
{
const size_t tensor_byte_offset = ownGetFlatByteOffset(index, dims, tensor_dims, tensor_strides);
switch(fmt)
{
case TT_Q78:
{
const vx_int16 res = *(vx_int16*)((char*)dst_data + tensor_byte_offset);
const vx_int16 val = *(vx_int16*)((char*)src_data + tensor_byte_offset);
const int16_t ref = *((vx_int16*)lut_data + (size_t)((int32_t)lut_offset + (int32_t)val));
if (res != ref)
{
printf("DIFF!!!\t\t{ src[%zu] : %f (raw: %d), LUT[%d + %u]: %f (raw: %d), res[%zu]: %f (raw: %d) }\n",
tensor_byte_offset / sizeof(vx_int16), val / 256.f, val,
val, lut_offset, ref / 256.f, ref,
tensor_byte_offset / sizeof(vx_int16), res / 256.f, res);
}
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
{
ASSERT_EQ_INT(res, ref);
}
else
{
EXPECT_EQ_INT(res, ref);
}
}
break;
case TT_U8:
{
const vx_uint8 res = *(vx_uint8*)((char*)dst_data + tensor_byte_offset);
const vx_uint8 val = *(vx_uint8*)((char*)src_data + tensor_byte_offset);
const uint8_t ref = *((vx_uint8*)lut_data + (size_t)((int32_t)lut_offset + (int32_t)val));
if (res != ref)
{
printf("DIFF!!!\t\t{ src[%zu] : %d, LUT[%d + %u]: %d, res[%zu]: %d }\n",
tensor_byte_offset / sizeof(vx_uint8), val,
val, lut_offset, ref,
tensor_byte_offset / sizeof(vx_uint8), res);
}
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
{
ASSERT_EQ_INT(res, ref);
}
else
{
EXPECT_EQ_INT(res, ref);
}
}
break;
default: assert(0);
}
}
}
VX_CALL(vxReleaseTensor(&src_tensor));
VX_CALL(vxReleaseTensor(&dst_tensor));
VX_CALL(vxReleaseLUT(&lut));
EXPECT_EQ_PTR(NULL, src_tensor);
EXPECT_EQ_PTR(NULL, dst_tensor);
EXPECT_EQ_PTR(NULL, lut);
ct_free_mem(src_data);
ct_free_mem(dst_data);
}
ct_free_mem(tensor_dims);
ct_free_mem(tensor_strides);
}
/****************************************************************************
* *
* Test vxTensorTransposeNode *
* *
***************************************************************************/
typedef struct
{
const char * name;
enum TestTensorDF fmt;
} test_tensor_transpose_op_arg;
TEST_WITH_ARG(TensorOp, testvxTensorTranspose, test_tensor_transpose_op_arg,
ARG("Q78_TRANSPOSE", TT_Q78),
ARG("U8_TRANSPOSE", TT_U8),
ARG("S8_TRANSPOSE", TT_S8),
)
{
const vx_context context = context_->vx_context_;
const enum TestTensorDF fmt = arg_->fmt;
assert(fmt == TT_Q78 || fmt == TT_U8 || fmt == TT_S8);
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims > 3);
if(!DEBUG_TEST_TENSOR_BEYOND_FOUR_DIMS) max_dims = 4; else max_dims = MIN(max_dims, MAX_TENSOR_DIMS);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type = 0;
vx_uint8 fixed_point_position = 0;
vx_size sizeof_data_type = 0;
ownUnpackFormat(fmt, &data_type, &fixed_point_position, &sizeof_data_type);
size_t * const src_dims = ct_alloc_mem(sizeof(*src_dims) * max_dims);
size_t * const dst_dims = ct_alloc_mem(sizeof(*dst_dims) * max_dims);
ASSERT(src_dims && dst_dims);
//TODO: fix the following comment after it's settled :)
// The way we implement the transpose reference is simply by swapping the 2
// relevant dims' strides (ref_strides) relative to the original (src) ones.
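// Worked example: for a Q78 src of dims { 2, 3 } (2-byte elements), src_strides == { 2, 4 }.
// Transposing dims 0 and 1 gives dst_dims == { 3, 2 }, dst_strides == { 2, 6 } and
// ref_strides == { 4, 2 }, so the dst element at coordinate (x, y) is compared against the
// src element at coordinate (y, x), i.e. byte offset x * 4 + y * 2 into src_data.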
size_t * const src_strides = ct_alloc_mem(sizeof(*src_strides) * max_dims);
size_t * const dst_strides = ct_alloc_mem(sizeof(*dst_strides) * max_dims);
size_t * const ref_strides = ct_alloc_mem(sizeof(*ref_strides) * max_dims);
ASSERT(src_strides && dst_strides && ref_strides);
//TODO: @Tomer, should swapping a dim with itself be acceptable?
// The strategy is a simple one: For each of the 1..max_dims supported,
// we test all n^2 possible 2-dim combos for transposition.
// We choose to do so since sum_{n=1}^{max_dims} n^2 ~ O(max_dims^3), which
// isn't much for any practical number of supported dimensions.
// An alternative method could be similar to the one used in the
// Elementwise Op tests, where we ran TEST_TENSOR_NUM_ITERATIONS iters
// with a random choice of 2 dims. But for practical values of max_dims (~6?)
// it's hardly any different.
//TODO: @Tomer, do you prefer the random approach?
//
// Note that we still chose to use pseudo-random data rather than
// sequential values, since the S8/U8 types would force a short repeating
// pattern, making it slightly harder to debug and possibly missing
// perfectly aligned transpose cases, however unlikely... :)
// Note that the dim-pair loops are the inner loops, so that if two
// implementations support D1 and D2 dims resp., the same (pseudo-random)
// values would be used when testing the common min(D1, D2) dimensions.
//TODO: If a single dim cannot be "transposed with itself" (copy/NOP for virt), start from 2
for (vx_size dims = 1; dims <= max_dims; ++dims)
for (vx_size transpose_dim0 = 0; transpose_dim0 < dims; ++transpose_dim0)
for (vx_size transpose_dim1 = 1; transpose_dim1 < dims; ++transpose_dim1)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("dims: %zu, transpose_dim0: %zu, transpose_dim1: %zu\n", dims, transpose_dim0, transpose_dim1);
fflush(stdout);
}
// First step is to get some random dim sizes, calc the strides and create the tensors.
{
for (vx_size i = 0; i < dims; ++i)
{
src_dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
dst_dims[i] = src_dims[i];
src_strides[i] = i ? src_strides[i - 1] * src_dims[i - 1] : sizeof_data_type;
ref_strides[i] = src_strides[i];
}
dst_dims[transpose_dim1] = src_dims[transpose_dim0];
dst_dims[transpose_dim0] = src_dims[transpose_dim1];
ref_strides[transpose_dim1] = src_strides[transpose_dim0];
ref_strides[transpose_dim0] = src_strides[transpose_dim1];
for (vx_size i = 0; i < dims; ++i)
{
dst_strides[i] = i ? dst_strides[i - 1] * dst_dims[i - 1] : sizeof_data_type;
}
}
vx_tensor src_tensor = vxCreateTensor(context, dims, src_dims, data_type, fixed_point_position);
vx_tensor dst_tensor = vxCreateTensor(context, dims, dst_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(src_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(dst_tensor, VX_TYPE_TENSOR);
const size_t bytes = src_dims[dims - 1] * src_strides[dims - 1];
const size_t count = bytes / sizeof_data_type;
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t src_dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", src_dims[i]); } printf(" },\n");
printf("\t dst_dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", dst_dims[i]); } printf(" },\n");
printf(" count: %zu, bytes: %zu,\n", count, bytes);
printf(" tensor_transpose_dims: { %zu, %zu }\n", transpose_dim0, transpose_dim1);
printf("\t }\n");
}
//TODO: This is pretty wasteful as it's repeating a lot of work per iteration:
// Both in the repeated malloc + free and inefficient data population
// which discards much of the random data, only using a part of it.
// Second step is to allocate the input and output data locations and populate the inputs.
void * const src_data = ct_alloc_mem(bytes);
void * const dst_data = ct_alloc_mem(bytes);
ASSERT(src_data && dst_data);
{ //TODO: ownTestInitTensors(..) ?
ownFillRandData(fmt, &rng, count, src_data);
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(src_tensor, dims, view_start, src_dims, src_strides, src_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Third step is creating, running and disposing of the graph.
{
vx_graph graph = vxCreateGraph(context);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
vx_node node = vxTensorTransposeNode(graph, src_tensor, dst_tensor, transpose_dim0, transpose_dim1);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
const size_t view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(dst_tensor, dims, view_start, dst_dims, dst_strides, dst_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
for (size_t index = 0; index < count; ++index)
{
const size_t res_byte_offset = ownGetFlatByteOffset(index, dims, dst_dims, dst_strides);
const size_t ref_byte_offset = ownGetFlatByteOffset(index, dims, dst_dims, ref_strides);
//TODO: can unify the following to avoid the copy pasta...
switch(fmt)
{
case TT_Q78:
{
const vx_int16 res = *(vx_int16*)((char*)dst_data + res_byte_offset);
const vx_int16 ref = *(vx_int16*)((char*)src_data + ref_byte_offset);
if (res != ref)
{
printf("DIFF!!!\t\t{ src[%zu]: %f (raw: %d), dst[%zu]: %f (raw: %d) }\n",
ref_byte_offset / sizeof(vx_int16), ref / 256.f, ref,
res_byte_offset / sizeof(vx_int16), res / 256.f, res);
}
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
{
ASSERT_EQ_INT(res, ref);
}
else
{
EXPECT_EQ_INT(res, ref);
}
}
break;
case TT_U8:
{
const vx_uint8 res = *(vx_uint8*)((char*)dst_data + res_byte_offset);
const vx_uint8 ref = *(vx_uint8*)((char*)src_data + ref_byte_offset);
if (res != ref)
{
printf("DIFF!!!\t\t{ src[%zu]: %d, dst[%zu]: %d }\n",
ref_byte_offset / sizeof(vx_uint8), ref,
res_byte_offset / sizeof(vx_uint8), res);
}
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
{
ASSERT_EQ_INT(res, ref);
}
else
{
EXPECT_EQ_INT(res, ref);
}
}
break;
case TT_S8:
{
const vx_int8 res = *(vx_int8*)((char*)dst_data + res_byte_offset);
const vx_int8 ref = *(vx_int8*)((char*)src_data + ref_byte_offset);
if (res != ref)
{
printf("DIFF!!!\t\t{ src[%zu]: %d, dst[%zu]: %d }\n",
ref_byte_offset / sizeof(vx_int8), ref,
res_byte_offset / sizeof(vx_int8), res);
}
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
{
ASSERT_EQ_INT(res, ref);
}
else
{
EXPECT_EQ_INT(res, ref);
}
}
break;
default: assert(0);
}
}
}
VX_CALL(vxReleaseTensor(&src_tensor));
VX_CALL(vxReleaseTensor(&dst_tensor));
EXPECT_EQ_PTR(NULL, src_tensor);
EXPECT_EQ_PTR(NULL, dst_tensor);
ct_free_mem(src_data);
ct_free_mem(dst_data);
}
ct_free_mem(src_dims);
ct_free_mem(dst_dims);
ct_free_mem(src_strides);
ct_free_mem(dst_strides);
ct_free_mem(ref_strides);
}
TEST_WITH_ARG(TensorOp, testvxuTensorTranspose, test_tensor_transpose_op_arg,
ARG("Q78_TRANSPOSE", TT_Q78),
ARG("U8_TRANSPOSE", TT_U8),
ARG("S8_TRANSPOSE", TT_S8),
)
{
const vx_context context = context_->vx_context_;
const enum TestTensorDF fmt = arg_->fmt;
assert(fmt == TT_Q78 || fmt == TT_U8 || fmt == TT_S8);
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims > 3);
if(!DEBUG_TEST_TENSOR_BEYOND_FOUR_DIMS) max_dims = 4; else max_dims = MIN(max_dims, MAX_TENSOR_DIMS);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type = 0;
vx_uint8 fixed_point_position = 0;
vx_size sizeof_data_type = 0;
ownUnpackFormat(fmt, &data_type, &fixed_point_position, &sizeof_data_type);
size_t * const src_dims = ct_alloc_mem(sizeof(*src_dims) * max_dims);
size_t * const dst_dims = ct_alloc_mem(sizeof(*dst_dims) * max_dims);
ASSERT(src_dims && dst_dims);
//TODO: fix the following comment after it's settled :)
// The way we implement the transpose reference is simply by swapping the 2
// relevant dims' strides (ref_strides) relative to the original (src) ones.
size_t * const src_strides = ct_alloc_mem(sizeof(*src_strides) * max_dims);
size_t * const dst_strides = ct_alloc_mem(sizeof(*dst_strides) * max_dims);
size_t * const ref_strides = ct_alloc_mem(sizeof(*ref_strides) * max_dims);
ASSERT(src_strides && dst_strides && ref_strides);
//TODO: @Tomer, should swapping a dim with itself be acceptable?
// The strategy is a simple one: For each of the 1..max_dims supported,
// we test all n^2 possible 2-dim combos for transposition.
// We choose to do so since sum_{n=1}^{max_dims} n^2 ~ O(max_dims^3), which
// isn't much for any practical number of supported dimensions.
// An alternative method could be similar to the one used in the
// Elementwise Op tests, where we ran TEST_TENSOR_NUM_ITERATIONS iters
// with a random choice of 2 dims. But for practical values of max_dims (~6?)
// it's hardly any different.
//TODO: @Tomer, do you prefer the random approach?
//
// Note that we still chose to use pseudo-random data rather than
// sequential values, since the S8/U8 types would force a short repeating
// pattern, making it slightly harder to debug and possibly missing
// perfectly aligned transpose cases, however unlikely... :)
// Note that the dim-pair loops are the inner loops, so that if two
// implementations support D1 and D2 dims resp., the same (pseudo-random)
// values would be used when testing the common min(D1, D2) dimensions.
//TODO: If a single dim cannot be "transposed with itself" (copy/NOP for virt), start from 2
for (vx_size dims = 1; dims <= max_dims; ++dims)
for (vx_size transpose_dim0 = 0; transpose_dim0 < dims; ++transpose_dim0)
for (vx_size transpose_dim1 = 1; transpose_dim1 < dims; ++transpose_dim1)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("dims: %zu, transpose_dim0: %zu, transpose_dim1: %zu\n", dims, transpose_dim0, transpose_dim1);
fflush(stdout);
}
// First step is to get some random dim sizes, calc the strides and create the tensors.
{
for (vx_size i = 0; i < dims; ++i)
{
src_dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
dst_dims[i] = src_dims[i];
src_strides[i] = i ? src_strides[i - 1] * src_dims[i - 1] : sizeof_data_type;
ref_strides[i] = src_strides[i];
}
dst_dims[transpose_dim1] = src_dims[transpose_dim0];
dst_dims[transpose_dim0] = src_dims[transpose_dim1];
ref_strides[transpose_dim1] = src_strides[transpose_dim0];
ref_strides[transpose_dim0] = src_strides[transpose_dim1];
for (vx_size i = 0; i < dims; ++i)
{
dst_strides[i] = i ? dst_strides[i - 1] * dst_dims[i - 1] : sizeof_data_type;
}
}
vx_tensor src_tensor = vxCreateTensor(context, dims, src_dims, data_type, fixed_point_position);
vx_tensor dst_tensor = vxCreateTensor(context, dims, dst_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(src_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(dst_tensor, VX_TYPE_TENSOR);
const size_t bytes = src_dims[dims - 1] * src_strides[dims - 1];
const size_t count = bytes / sizeof_data_type;
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t src_dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", src_dims[i]); } printf(" },\n");
printf("\t dst_dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", dst_dims[i]); } printf(" },\n");
printf(" count: %zu, bytes: %zu,\n", count, bytes);
printf(" tensor_transpose_dims: { %zu, %zu }\n", transpose_dim0, transpose_dim1);
printf("\t }\n");
}
//TODO: This is pretty wasteful as it's repeating a lot of work per iteration:
// Both in the repeated malloc + free and inefficient data population
// which discards much of the random data, only using a part of it.
// Second step is to allocate the input and output data locations and populate the inputs.
void * const src_data = ct_alloc_mem(bytes);
void * const dst_data = ct_alloc_mem(bytes);
ASSERT(src_data && dst_data);
{ //TODO: ownTestInitTensors(..) ?
ownFillRandData(fmt, &rng, count, src_data);
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(src_tensor, dims, view_start, src_dims, src_strides, src_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Third step is creating, running and disposing of the graph.
{
VX_CALL(vxuTensorTranspose(context, src_tensor, dst_tensor, transpose_dim0, transpose_dim1));
}
// Verify the results
{
const size_t view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(dst_tensor, dims, view_start, dst_dims, dst_strides, dst_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
for (size_t index = 0; index < count; ++index)
{
const size_t res_byte_offset = ownGetFlatByteOffset(index, dims, dst_dims, dst_strides);
const size_t ref_byte_offset = ownGetFlatByteOffset(index, dims, dst_dims, ref_strides);
//TODO: can unify the following to avoid the copy pasta...
switch(fmt)
{
case TT_Q78:
{
const vx_int16 res = *(vx_int16*)((char*)dst_data + res_byte_offset);
const vx_int16 ref = *(vx_int16*)((char*)src_data + ref_byte_offset);
if (res != ref)
{
printf("DIFF!!!\t\t{ src[%zu]: %f (raw: %d), dst[%zu]: %f (raw: %d) }\n",
ref_byte_offset / sizeof(vx_int16), ref / 256.f, ref,
res_byte_offset / sizeof(vx_int16), res / 256.f, res);
}
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
{
ASSERT_EQ_INT(res, ref);
}
else
{
EXPECT_EQ_INT(res, ref);
}
}
break;
case TT_U8:
{
const vx_uint8 res = *(vx_uint8*)((char*)dst_data + res_byte_offset);
const vx_uint8 ref = *(vx_uint8*)((char*)src_data + ref_byte_offset);
if (res != ref)
{
printf("DIFF!!!\t\t{ src[%zu]: %d, dst[%zu]: %d }\n",
ref_byte_offset / sizeof(vx_uint8), ref,
res_byte_offset / sizeof(vx_uint8), res);
}
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
{
ASSERT_EQ_INT(res, ref);
}
else
{
EXPECT_EQ_INT(res, ref);
}
}
break;
case TT_S8:
{
const vx_int8 res = *(vx_int8*)((char*)dst_data + res_byte_offset);
const vx_int8 ref = *(vx_int8*)((char*)src_data + ref_byte_offset);
if (res != ref)
{
printf("DIFF!!!\t\t{ src[%zu]: %d, dst[%zu]: %d }\n",
ref_byte_offset / sizeof(vx_int8), ref,
res_byte_offset / sizeof(vx_int8), res);
}
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR)
{
ASSERT_EQ_INT(res, ref);
}
else
{
EXPECT_EQ_INT(res, ref);
}
}
break;
default: assert(0);
}
}
}
VX_CALL(vxReleaseTensor(&src_tensor));
VX_CALL(vxReleaseTensor(&dst_tensor));
EXPECT_EQ_PTR(NULL, src_tensor);
EXPECT_EQ_PTR(NULL, dst_tensor);
ct_free_mem(src_data);
ct_free_mem(dst_data);
}
ct_free_mem(src_dims);
ct_free_mem(dst_dims);
ct_free_mem(src_strides);
ct_free_mem(dst_strides);
ct_free_mem(ref_strides);
}
/****************************************************************************
* *
* Test vxTensorConvertDepthNode *
* *
***************************************************************************/
typedef struct
{
const char * name;
enum vx_convert_policy_e policy;
enum TestTensorDF src_fmt;
enum TestTensorDF dst_fmt;
float offset;
float norm;
} test_tensor_convert_depth_op_arg;
//TODO: what kind of configs do we want to test? It doesn't have to be full width conversions
TEST_WITH_ARG(TensorOp, testvxTensorConvertDepth, test_tensor_convert_depth_op_arg,
ARG("DEPTH_CONVERT_SAT_Q78_TO_Q78_FULL", VX_CONVERT_POLICY_SATURATE, TT_Q78, TT_Q78, 0.f, 1.f),
ARG("DEPTH_CONVERT_SAT_Q78_TO_U8_FULL", VX_CONVERT_POLICY_SATURATE, TT_Q78, TT_U8, 128.f, 1.f),
ARG("DEPTH_CONVERT_SAT_Q78_TO_S8_FULL", VX_CONVERT_POLICY_SATURATE, TT_Q78, TT_S8, 0.f, 1.f),
ARG("DEPTH_CONVERT_SAT_U8_TO_Q78_FULL", VX_CONVERT_POLICY_SATURATE, TT_U8, TT_Q78, -128.f, 1.f),
ARG("DEPTH_CONVERT_SAT_U8_TO_U8_FULL", VX_CONVERT_POLICY_SATURATE, TT_U8, TT_U8, 0.f, 1.f),
ARG("DEPTH_CONVERT_SAT_U8_TO_S8_FULL", VX_CONVERT_POLICY_SATURATE, TT_U8, TT_S8, -128.f, 1.f),
ARG("DEPTH_CONVERT_SAT_S8_TO_Q78_FULL", VX_CONVERT_POLICY_SATURATE, TT_S8, TT_Q78, 0.f, 1.f),
ARG("DEPTH_CONVERT_SAT_S8_TO_U8_FULL", VX_CONVERT_POLICY_SATURATE, TT_S8, TT_U8, 128.f, 1.f),
ARG("DEPTH_CONVERT_SAT_S8_TO_S8_FULL", VX_CONVERT_POLICY_SATURATE, TT_S8, TT_S8, 0.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_Q78_TO_Q78_FULL", VX_CONVERT_POLICY_WRAP, TT_Q78, TT_Q78, 0.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_Q78_TO_U8_FULL", VX_CONVERT_POLICY_WRAP, TT_Q78, TT_U8, 128.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_Q78_TO_S8_FULL", VX_CONVERT_POLICY_WRAP, TT_Q78, TT_S8, 0.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_U8_TO_Q78_FULL", VX_CONVERT_POLICY_WRAP, TT_U8, TT_Q78, -128.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_U8_TO_U8_FULL", VX_CONVERT_POLICY_WRAP, TT_U8, TT_U8, 0.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_U8_TO_S8_FULL", VX_CONVERT_POLICY_WRAP, TT_U8, TT_S8, -128.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_S8_TO_Q78_FULL", VX_CONVERT_POLICY_WRAP, TT_S8, TT_Q78, 0.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_S8_TO_U8_FULL", VX_CONVERT_POLICY_WRAP, TT_S8, TT_U8, 128.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_S8_TO_S8_FULL", VX_CONVERT_POLICY_WRAP, TT_S8, TT_S8, 0.f, 1.f),
)
{
const vx_context context = context_->vx_context_;
const enum TestTensorDF src_fmt = arg_->src_fmt;
const enum TestTensorDF dst_fmt = arg_->dst_fmt;
assert(src_fmt == TT_Q78 || src_fmt == TT_U8 || src_fmt == TT_S8);
assert(dst_fmt == TT_Q78 || dst_fmt == TT_U8 || dst_fmt == TT_S8);
const enum vx_convert_policy_e policy = arg_->policy;
assert(policy == VX_CONVERT_POLICY_SATURATE || policy == VX_CONVERT_POLICY_WRAP);
const float offset = arg_->offset;
const float norm = arg_->norm;
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims > 3);
if(!DEBUG_TEST_TENSOR_BEYOND_FOUR_DIMS) max_dims = 4; else max_dims = MIN(max_dims, MAX_TENSOR_DIMS);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum src_data_type = 0;
vx_enum dst_data_type = 0;
vx_uint8 src_fixed_point_position = 0;
vx_uint8 dst_fixed_point_position = 0;
vx_size src_sizeof_data_type = 0;
vx_size dst_sizeof_data_type = 0;
ownUnpackFormat(src_fmt, &src_data_type, &src_fixed_point_position, &src_sizeof_data_type);
ownUnpackFormat(dst_fmt, &dst_data_type, &dst_fixed_point_position, &dst_sizeof_data_type);
size_t * const tensor_dims = ct_alloc_mem(sizeof(*tensor_dims) * max_dims);
size_t * const src_tensor_strides = ct_alloc_mem(sizeof(*src_tensor_strides) * max_dims);
size_t * const dst_tensor_strides = ct_alloc_mem(sizeof(*dst_tensor_strides) * max_dims);
ASSERT(tensor_dims && src_tensor_strides && dst_tensor_strides);
//TODO: what's the testing strategy here? missing desc.
// Note that iter is the inner loop, so that if two implementations support
// D1 and D2 dims respectively, the same (pseudo-random) values would be used when
// testing the common min(D1, D2) dimensions.
for (vx_size dims = 1; dims <= max_dims; ++dims)
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("dims #: %zu,\titer #: %d\n", dims, iter);
fflush(stdout);
}
// First step is to get some random dim sizes, calc the strides and create the tensors.
for (vx_size i = 0; i < dims; ++i)
{
tensor_dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
src_tensor_strides[i] = i ? src_tensor_strides[i-1] * tensor_dims[i-1] : src_sizeof_data_type;
dst_tensor_strides[i] = i ? dst_tensor_strides[i-1] * tensor_dims[i-1] : dst_sizeof_data_type;
}
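// The strides describe a dense, row-major-like layout with dimension 0 as the
// innermost one. For example, tensor_dims = { 3, 4, 5 } with a 2-byte element
// type yields strides of { 2, 6, 24 } bytes and 5 * 24 = 120 bytes in total.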
vx_tensor src_tensor = vxCreateTensor(context, dims, tensor_dims, src_data_type, src_fixed_point_position);
vx_tensor dst_tensor = vxCreateTensor(context, dims, tensor_dims, dst_data_type, dst_fixed_point_position);
ASSERT_VX_OBJECT(src_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(dst_tensor, VX_TYPE_TENSOR);
const size_t src_tensor_bytes = tensor_dims[dims-1] * src_tensor_strides[dims-1];
const size_t dst_tensor_bytes = tensor_dims[dims-1] * dst_tensor_strides[dims-1];
const size_t count = src_tensor_bytes / src_sizeof_data_type;
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t tensor_dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", tensor_dims[i]); } printf(" }, \n");
printf("\t }\n");
}
void * const src_data = ct_alloc_mem(src_tensor_bytes);
void * const dst_data = ct_alloc_mem(dst_tensor_bytes);
ASSERT(src_data && dst_data);
{ // Second step is filling the src tensor with random data. TODO: ownTestInitTensors(..) ?
ownFillRandData(src_fmt, &rng, count, src_data);
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(src_tensor, dims, view_start, tensor_dims, src_tensor_strides, src_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Third step is creating, running and disposing of the graph.
{
vx_graph graph = vxCreateGraph(context);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
vx_scalar norm_sc = vxCreateScalar(context, VX_TYPE_FLOAT32, &norm);
vx_scalar offset_sc = vxCreateScalar(context, VX_TYPE_FLOAT32, &offset);
ASSERT_VX_OBJECT(norm_sc, VX_TYPE_SCALAR);
ASSERT_VX_OBJECT(offset_sc, VX_TYPE_SCALAR);
vx_node node = vxTensorConvertDepthNode(graph, src_tensor, policy, norm_sc, offset_sc, dst_tensor);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxReleaseScalar(&norm_sc));
VX_CALL(vxReleaseScalar(&offset_sc));
EXPECT_EQ_PTR(NULL, norm_sc);
EXPECT_EQ_PTR(NULL, offset_sc);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
// Verify the results
{
const size_t view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(dst_tensor, dims, view_start, tensor_dims, dst_tensor_strides, dst_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
const float scale = 1.f / norm;
const bool wrap = policy == VX_CONVERT_POLICY_WRAP;
for (size_t index = 0; index < count; ++index)
{
const size_t src_tensor_byte_offset = ownGetFlatByteOffset(index, dims, tensor_dims, src_tensor_strides);
const size_t dst_tensor_byte_offset = ownGetFlatByteOffset(index, dims, tensor_dims, dst_tensor_strides);
float tmp = .0f;
switch(src_fmt)
{
case TT_Q78:
tmp = *(vx_int16*)((char*)src_data + src_tensor_byte_offset);
tmp /= Q78_SCALE;
break;
case TT_U8:
tmp = *(vx_uint8*)((char*)src_data + src_tensor_byte_offset);
break;
case TT_S8:
tmp = *(vx_int8*)((char*)src_data + src_tensor_byte_offset);
break;
default: assert(0);
}
tmp = (tmp - offset) * scale;
//TODO: missing allowed eps
switch(dst_fmt)
{
case TT_Q78:
{
tmp *= Q78_SCALE;
vx_int16 ref = wrap ? (vx_int16)tmp : CLAMP(tmp, INT16_MIN, INT16_MAX); //TODO: cast issue?
vx_int16 res = *(vx_int16*)((char*)dst_data + dst_tensor_byte_offset);
if (res != ref) printf("DIFF!!! { idx: %zu, res: %d, ref: %d }\n", index, (int)res, (int)ref);
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT_EQ_INT(res, ref); else EXPECT_EQ_INT(res, ref);
}
break;
case TT_U8:
{
vx_uint8 ref = wrap ? (vx_uint8)tmp : CLAMP(tmp, 0, UINT8_MAX); // CLAMP not really needed
vx_uint8 res = *(vx_uint8*)((char*)dst_data + dst_tensor_byte_offset);
if (res != ref) printf("DIFF!!! { idx: %zu, res: %d, ref: %d }\n", index, (int)res, (int)ref);
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT_EQ_INT(res, ref); else EXPECT_EQ_INT(res, ref);
}
break;
case TT_S8:
{
vx_int8 ref = wrap ? (vx_int8)tmp : (vx_int8)CLAMP(tmp, INT8_MIN, INT8_MAX); //TODO: cast issue?
vx_int8 res = *(vx_int8*)((char*)dst_data + dst_tensor_byte_offset);
if (res != ref) printf("DIFF!!! { idx: %zu, res: %d, ref: %d }\n", index, (int)res, (int)ref);
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT_EQ_INT(res, ref); else EXPECT_EQ_INT(res, ref);
}
break;
default: assert(0);
}
}
}
VX_CALL(vxReleaseTensor(&src_tensor));
VX_CALL(vxReleaseTensor(&dst_tensor));
EXPECT_EQ_PTR(NULL, src_tensor);
EXPECT_EQ_PTR(NULL, dst_tensor);
ct_free_mem(src_data);
ct_free_mem(dst_data);
}
ct_free_mem(tensor_dims);
ct_free_mem(src_tensor_strides);
ct_free_mem(dst_tensor_strides);
}
TEST_WITH_ARG(TensorOp, testvxuTensorConvertDepth, test_tensor_convert_depth_op_arg,
ARG("DEPTH_CONVERT_SAT_Q78_TO_Q78_FULL", VX_CONVERT_POLICY_SATURATE, TT_Q78, TT_Q78, 0.f, 1.f),
ARG("DEPTH_CONVERT_SAT_Q78_TO_U8_FULL", VX_CONVERT_POLICY_SATURATE, TT_Q78, TT_U8, 128.f, 1.f),
ARG("DEPTH_CONVERT_SAT_Q78_TO_S8_FULL", VX_CONVERT_POLICY_SATURATE, TT_Q78, TT_S8, 0.f, 1.f),
ARG("DEPTH_CONVERT_SAT_U8_TO_Q78_FULL", VX_CONVERT_POLICY_SATURATE, TT_U8, TT_Q78, -128.f, 1.f),
ARG("DEPTH_CONVERT_SAT_U8_TO_U8_FULL", VX_CONVERT_POLICY_SATURATE, TT_U8, TT_U8, 0.f, 1.f),
ARG("DEPTH_CONVERT_SAT_U8_TO_S8_FULL", VX_CONVERT_POLICY_SATURATE, TT_U8, TT_S8, -128.f, 1.f),
ARG("DEPTH_CONVERT_SAT_S8_TO_Q78_FULL", VX_CONVERT_POLICY_SATURATE, TT_S8, TT_Q78, 0.f, 1.f),
ARG("DEPTH_CONVERT_SAT_S8_TO_U8_FULL", VX_CONVERT_POLICY_SATURATE, TT_S8, TT_U8, 128.f, 1.f),
ARG("DEPTH_CONVERT_SAT_S8_TO_S8_FULL", VX_CONVERT_POLICY_SATURATE, TT_S8, TT_S8, 0.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_Q78_TO_Q78_FULL", VX_CONVERT_POLICY_WRAP, TT_Q78, TT_Q78, 0.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_Q78_TO_U8_FULL", VX_CONVERT_POLICY_WRAP, TT_Q78, TT_U8, 128.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_Q78_TO_S8_FULL", VX_CONVERT_POLICY_WRAP, TT_Q78, TT_S8, 0.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_U8_TO_Q78_FULL", VX_CONVERT_POLICY_WRAP, TT_U8, TT_Q78, -128.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_U8_TO_U8_FULL", VX_CONVERT_POLICY_WRAP, TT_U8, TT_U8, 0.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_U8_TO_S8_FULL", VX_CONVERT_POLICY_WRAP, TT_U8, TT_S8, -128.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_S8_TO_Q78_FULL", VX_CONVERT_POLICY_WRAP, TT_S8, TT_Q78, 0.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_S8_TO_U8_FULL", VX_CONVERT_POLICY_WRAP, TT_S8, TT_U8, 128.f, 1.f),
ARG("DEPTH_CONVERT_WRAP_S8_TO_S8_FULL", VX_CONVERT_POLICY_WRAP, TT_S8, TT_S8, 0.f, 1.f),
)
{
const vx_context context = context_->vx_context_;
const enum TestTensorDF src_fmt = arg_->src_fmt;
const enum TestTensorDF dst_fmt = arg_->dst_fmt;
assert(src_fmt == TT_Q78 || src_fmt == TT_U8 || src_fmt == TT_S8);
assert(dst_fmt == TT_Q78 || dst_fmt == TT_U8 || dst_fmt == TT_S8);
const enum vx_convert_policy_e policy = arg_->policy;
assert(policy == VX_CONVERT_POLICY_SATURATE || policy == VX_CONVERT_POLICY_WRAP);
const float offset = arg_->offset;
const float norm = arg_->norm;
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims > 3);
if(!DEBUG_TEST_TENSOR_BEYOND_FOUR_DIMS) max_dims = 4; else max_dims = MIN(max_dims, MAX_TENSOR_DIMS);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum src_data_type = 0;
vx_enum dst_data_type = 0;
vx_uint8 src_fixed_point_position = 0;
vx_uint8 dst_fixed_point_position = 0;
vx_size src_sizeof_data_type = 0;
vx_size dst_sizeof_data_type = 0;
ownUnpackFormat(src_fmt, &src_data_type, &src_fixed_point_position, &src_sizeof_data_type);
ownUnpackFormat(dst_fmt, &dst_data_type, &dst_fixed_point_position, &dst_sizeof_data_type);
size_t * const tensor_dims = ct_alloc_mem(sizeof(*tensor_dims) * max_dims);
size_t * const src_tensor_strides = ct_alloc_mem(sizeof(*src_tensor_strides) * max_dims);
size_t * const dst_tensor_strides = ct_alloc_mem(sizeof(*dst_tensor_strides) * max_dims);
ASSERT(tensor_dims && src_tensor_strides && dst_tensor_strides);
//TODO: what's the testing strategy here? missing desc.
// Note that iter is the inner loop, so that if two implementations support
// D1 and D2 dims respectively, the same (pseudo-random) values would be used when
// testing the common min(D1, D2) dimensions.
for (vx_size dims = 1; dims <= max_dims; ++dims)
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("dims #: %zu,\titer #: %d\n", dims, iter);
fflush(stdout);
}
// First step is to get some random dim sizes, calc the strides and create the tensors.
for (vx_size i = 0; i < dims; ++i)
{
tensor_dims[i] = (size_t)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
src_tensor_strides[i] = i ? src_tensor_strides[i-1] * tensor_dims[i-1] : src_sizeof_data_type;
dst_tensor_strides[i] = i ? dst_tensor_strides[i-1] * tensor_dims[i-1] : dst_sizeof_data_type;
}
vx_tensor src_tensor = vxCreateTensor(context, dims, tensor_dims, src_data_type, src_fixed_point_position);
vx_tensor dst_tensor = vxCreateTensor(context, dims, tensor_dims, dst_data_type, dst_fixed_point_position);
ASSERT_VX_OBJECT(src_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(dst_tensor, VX_TYPE_TENSOR);
const size_t src_tensor_bytes = tensor_dims[dims-1] * src_tensor_strides[dims-1];
const size_t dst_tensor_bytes = tensor_dims[dims-1] * dst_tensor_strides[dims-1];
const size_t count = src_tensor_bytes / src_sizeof_data_type;
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t tensor_dims: { "); for (size_t i = 0; i < dims; ++i) { printf("%zu, ", tensor_dims[i]); } printf(" }, \n");
printf("\t }\n");
}
void * const src_data = ct_alloc_mem(src_tensor_bytes);
void * const dst_data = ct_alloc_mem(dst_tensor_bytes);
ASSERT(src_data && dst_data);
{ // Second step is filling the src tensor with random data. TODO: ownTestInitTensors(..) ?
ownFillRandData(src_fmt, &rng, count, src_data);
vx_size view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(src_tensor, dims, view_start, tensor_dims, src_tensor_strides, src_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Third step is running the immediate mode vxuTensorConvertDepth call.
{
vx_scalar norm_sc = vxCreateScalar(context, VX_TYPE_FLOAT32, &norm);
vx_scalar offset_sc = vxCreateScalar(context, VX_TYPE_FLOAT32, &offset);
ASSERT_VX_OBJECT(norm_sc, VX_TYPE_SCALAR);
ASSERT_VX_OBJECT(offset_sc, VX_TYPE_SCALAR);
VX_CALL(vxuTensorConvertDepth(context, src_tensor, policy, norm_sc, offset_sc, dst_tensor));
VX_CALL(vxReleaseScalar(&norm_sc));
VX_CALL(vxReleaseScalar(&offset_sc));
EXPECT_EQ_PTR(NULL, norm_sc);
EXPECT_EQ_PTR(NULL, offset_sc);
}
// Verify the results
{
const size_t view_start[MAX_TENSOR_DIMS] = { 0 };
VX_CALL(vxCopyTensorPatch(dst_tensor, dims, view_start, tensor_dims, dst_tensor_strides, dst_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
const float scale = 1.f / norm;
const bool wrap = policy == VX_CONVERT_POLICY_WRAP;
for (size_t index = 0; index < count; ++index)
{
const size_t src_tensor_byte_offset = ownGetFlatByteOffset(index, dims, tensor_dims, src_tensor_strides);
const size_t dst_tensor_byte_offset = ownGetFlatByteOffset(index, dims, tensor_dims, dst_tensor_strides);
float tmp = .0f;
switch(src_fmt)
{
case TT_Q78:
tmp = *(vx_int16*)((char*)src_data + src_tensor_byte_offset);
tmp /= Q78_SCALE;
break;
case TT_U8:
tmp = *(vx_uint8*)((char*)src_data + src_tensor_byte_offset);
break;
case TT_S8:
tmp = *(vx_int8*)((char*)src_data + src_tensor_byte_offset);
break;
default: assert(0);
}
tmp = (tmp - offset) * scale;
//TODO: missing allowed eps
switch(dst_fmt)
{
case TT_Q78:
{
tmp *= Q78_SCALE;
vx_int16 ref = wrap ? (vx_int16)tmp : CLAMP(tmp, INT16_MIN, INT16_MAX); //TODO: cast issue?
vx_int16 res = *(vx_int16*)((char*)dst_data + dst_tensor_byte_offset);
if (res != ref) printf("DIFF!!! { idx: %zu, res: %d, ref: %d }\n", index, (int)res, (int)ref);
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT_EQ_INT(res, ref); else EXPECT_EQ_INT(res, ref);
}
break;
case TT_U8:
{
vx_uint8 ref = wrap ? (vx_uint8)tmp : CLAMP(tmp, 0, UINT8_MAX); // CLAMP not really needed
vx_uint8 res = *(vx_uint8*)((char*)dst_data + dst_tensor_byte_offset);
if (res != ref) printf("DIFF!!! { idx: %zu, res: %d, ref: %d }\n", index, (int)res, (int)ref);
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT_EQ_INT(res, ref); else EXPECT_EQ_INT(res, ref);
}
break;
case TT_S8:
{
vx_int8 ref = wrap ? (vx_int8)tmp : (vx_int8)CLAMP(tmp, INT8_MIN, INT8_MAX); //TODO: cast issue?
vx_int8 res = *(vx_int8*)((char*)dst_data + dst_tensor_byte_offset);
if (res != ref) printf("DIFF!!! { idx: %zu, res: %d, ref: %d }\n", index, (int)res, (int)ref);
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT_EQ_INT(res, ref); else EXPECT_EQ_INT(res, ref);
}
break;
default: assert(0);
}
}
}
VX_CALL(vxReleaseTensor(&src_tensor));
VX_CALL(vxReleaseTensor(&dst_tensor));
EXPECT_EQ_PTR(NULL, src_tensor);
EXPECT_EQ_PTR(NULL, dst_tensor);
ct_free_mem(src_data);
ct_free_mem(dst_data);
}
ct_free_mem(tensor_dims);
ct_free_mem(src_tensor_strides);
ct_free_mem(dst_tensor_strides);
}
/****************************************************************************
* *
* Test vxTensorMatrixMultiplyNode *
* *
***************************************************************************/
static void ownTensorMatrixMultiply(
enum TestTensorDF fmt,
const void * a_data, const vx_size * a_dims, const vx_size * a_strides, bool a_transposed,
const void * b_data, const vx_size * b_dims, const vx_size * b_strides, bool b_transposed,
const void * c_data, const vx_size * c_dims, const vx_size * c_strides, bool c_transposed,
void * ref_data, const vx_size * out_dims, const vx_size * out_strides)
{
const vx_size aa_strides[2] = { a_strides[a_transposed], a_strides[1-a_transposed] };
const vx_size bb_strides[2] = { b_strides[b_transposed], b_strides[1-b_transposed] };
const vx_size cc_strides[2] = { c_strides[c_transposed], c_strides[1-c_transposed] };
size_t common_dim = a_dims[a_transposed];
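// Transposition is handled purely through the strides: aa_strides[0] always
// steps along the reduction (common) dimension of a and aa_strides[1] along
// its output row dimension, regardless of how the data is actually laid out.
// The same remapping is applied to b and c, so the loops below can index every
// matrix as if it were stored non-transposed.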
switch(fmt)
{
case TT_Q78:
{
for (size_t y = 0; y < out_dims[1]; ++y)
for (size_t x = 0; x < out_dims[0]; ++x)
{
int_fast32_t accum = 0;
for (size_t i = 0; i < common_dim; ++i)
{
int_fast32_t a_val = *(vx_int16*)((char*)a_data + aa_strides[1] * y + aa_strides[0] * i);
int_fast32_t b_val = *(vx_int16*)((char*)b_data + bb_strides[1] * i + bb_strides[0] * x);
accum += a_val * b_val;
}
if (c_data)
{
accum += *(vx_int16*)((char*)c_data + cc_strides[1] * y + cc_strides[0] * x) * Q78_SCALE;
}
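// accum now holds a sum of Q7.8 * Q7.8 products, i.e. a value with 16
// fractional bits (the optional bias above was pre-multiplied by Q78_SCALE to
// match). Dividing by Q78_SCALE converts back to Q7.8; adding Q78_HALF
// (presumably Q78_SCALE / 2) first makes the division round to nearest rather
// than truncate for non-negative sums.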
*(vx_int16*)((char*)ref_data + out_strides[1] * y + out_strides[0] * x) =
CLAMP((accum + Q78_HALF) / Q78_SCALE, INT16_MIN, INT16_MAX);
}
}
break;
case TT_U8:
{
for (size_t y = 0; y < out_dims[1]; ++y)
for (size_t x = 0; x < out_dims[0]; ++x)
{
int_fast32_t accum = 0;
for (size_t i = 0; i < common_dim; ++i)
{
int_fast32_t a_val = *(vx_uint8*)((char*)a_data + aa_strides[1] * y + aa_strides[0] * i);
int_fast32_t b_val = *(vx_uint8*)((char*)b_data + bb_strides[1] * i + bb_strides[0] * x);
accum += a_val * b_val;
}
if (c_data)
{
accum += *(vx_uint8*)((char*)c_data + cc_strides[1] * y + cc_strides[0] * x);
}
*(vx_uint8*)((char*)ref_data + out_strides[1] * y + out_strides[0] * x) =
CLAMP(accum, 0, UINT8_MAX);
}
}
break;
case TT_S8:
{
for (size_t y = 0; y < out_dims[1]; ++y)
for (size_t x = 0; x < out_dims[0]; ++x)
{
int_fast32_t accum = 0;
for (size_t i = 0; i < common_dim; ++i)
{
int_fast32_t a_val = *(vx_int8*)((char*)a_data + aa_strides[1] * y + aa_strides[0] * i);
int_fast32_t b_val = *(vx_int8*)((char*)b_data + bb_strides[1] * i + bb_strides[0] * x);
accum += a_val * b_val;
}
if (c_data)
{
accum += *(vx_int8*)((char*)c_data + cc_strides[1] * y + cc_strides[0] * x);
}
*(vx_int8*)((char*)ref_data + out_strides[1] * y + out_strides[0] * x) =
CLAMP(accum, INT8_MIN, INT8_MAX);
}
}
break;
default:
assert(0);
}
}
typedef struct
{
const char * name;
enum TestTensorDF fmt;
bool c_present;
bool a_transposed;
bool b_transposed;
bool c_transposed; // ignored unless c_present
} test_tensor_matrix_multiply_op_arg;
//TODO: what kind of configs do we want to test?
#define TT_TENSOR_MAD_BASE(S_,FMT_,C_PRESENT_,A_T_,B_T_,C_T_) \
ARG("MAD_"S_, TT_ ## FMT_,C_PRESENT_,A_T_,B_T_,C_T_),
#define TT_TENSOR_MAD_B(S_,FMT_,A_,B_) \
TT_TENSOR_MAD_BASE(S_,FMT_,false,A_,B_,false) \
TT_TENSOR_MAD_BASE(S_"_C",FMT_,true,A_,B_,false) \
TT_TENSOR_MAD_BASE(S_"_CT",FMT_,true,A_,B_,true)
#define TT_TENSOR_MAD_A(S_,FMT_,A_) \
TT_TENSOR_MAD_B(S_ "_B",FMT_,A_,false) \
TT_TENSOR_MAD_B(S_ "_BT",FMT_,A_,true)
#define TT_TENSOR_MAD_0(FMT_) \
TT_TENSOR_MAD_A(#FMT_"_A", FMT_,false) \
TT_TENSOR_MAD_A(#FMT_"_AT",FMT_,true)
#define TT_TENSOR_MAD_ALL() \
TT_TENSOR_MAD_0(Q78) \
TT_TENSOR_MAD_0(U8) \
TT_TENSOR_MAD_0(S8)
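// The TT_TENSOR_MAD_* macros expand into one ARG() per combination of format
// (Q78/U8/S8), a transposed or not, b transposed or not, and c absent /
// present / present-transposed, e.g. "MAD_Q78_A_B", "MAD_Q78_A_B_C",
// "MAD_Q78_A_BT_CT", ... for 3 * 2 * 2 * 3 = 36 configurations in total.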
TEST_WITH_ARG(TensorOp, testvxTensorMatrixMultiply, test_tensor_matrix_multiply_op_arg,
TT_TENSOR_MAD_ALL()
)
{
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context_->vx_context_, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims > 3);
if(!DEBUG_TEST_TENSOR_BEYOND_FOUR_DIMS) max_dims = 4; else max_dims = MIN(max_dims, MAX_TENSOR_DIMS);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type = 0;
vx_uint8 fixed_point_position = 0;
vx_size sizeof_data_type = 0;
ownUnpackFormat(arg_->fmt, &data_type, &fixed_point_position, &sizeof_data_type);
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("iter #: %d\n", iter); fflush(stdout);
}
const vx_size m = (vx_size)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
const vx_size n = (vx_size)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
const vx_size k = (vx_size)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
// Note that unlike common GEMM, here we do not update an existing c but
// output to a different tensor!
const vx_size a_dims[2] = { arg_->a_transposed ? m : n, arg_->a_transposed ? n : m };
const vx_size b_dims[2] = { arg_->b_transposed ? n : k, arg_->b_transposed ? k : n };
const vx_size c_dims[2] = { arg_->c_transposed ? m : k, arg_->c_transposed ? k : m };
const vx_size out_dims[2] = { k, m };
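// Dimension 0 is the innermost (column) dimension, so the shapes above read
// as: a is m x n (or its transpose), b is n x k, c and out are m x k, giving
// out = a * b (+ c) with n as the common dimension.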
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t a_dims: { %zu, %zu },\n", a_dims[0], a_dims[1]);
printf("\t b_dims: { %zu, %zu },\n", b_dims[0], b_dims[1]);
if (arg_->c_present)
printf("\t c_dims: { %zu, %zu },\n", c_dims[0], c_dims[1]);
printf("\t out_dims: { %zu, %zu },\n", out_dims[0], out_dims[1]);
printf("\t }\n");
}
const vx_size a_strides[2] = { sizeof_data_type, sizeof_data_type * a_dims[0] };
const vx_size b_strides[2] = { sizeof_data_type, sizeof_data_type * b_dims[0] };
const vx_size c_strides[2] = { sizeof_data_type, sizeof_data_type * c_dims[0] };
const vx_size out_strides[2] = { sizeof_data_type, sizeof_data_type * out_dims[0] };
void * a_data = ct_alloc_mem(m * n * sizeof_data_type);
void * b_data = ct_alloc_mem(n * k * sizeof_data_type);
void * c_data = arg_->c_present ? ct_alloc_mem(m * k * sizeof_data_type) : NULL;
void * out_data = ct_alloc_mem(m * k * sizeof_data_type);
void * ref_data = ct_alloc_mem(m * k * sizeof_data_type);
ASSERT(a_data && b_data && (!arg_->c_present || c_data) && out_data && ref_data);
// Since we check the sum of products here, and the accumulator is only
// supposed to be 32 bits, we need smaller values so that the intermediate
// results don't exceed it.
ownFillSmallRandData(arg_->fmt, &rng, m * n, a_dims[0] + 1, a_data);
ownFillSmallRandData(arg_->fmt, &rng, n * k, a_dims[0] + 1, b_data);
if (arg_->c_present) { ownFillSmallRandData(arg_->fmt, &rng, m * k, a_dims[0] + 1, c_data); }
vx_tensor a_tensor = vxCreateTensor(context_->vx_context_, 2, a_dims, data_type, fixed_point_position);
vx_tensor b_tensor = vxCreateTensor(context_->vx_context_, 2, b_dims, data_type, fixed_point_position);
vx_tensor c_tensor = arg_->c_present ? vxCreateTensor(context_->vx_context_, 2, c_dims, data_type, fixed_point_position) : NULL;
vx_tensor out_tensor = vxCreateTensor(context_->vx_context_, 2, out_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(a_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(b_tensor, VX_TYPE_TENSOR);
if (arg_->c_present)
{
ASSERT_VX_OBJECT(c_tensor, VX_TYPE_TENSOR);
}
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
vx_size view_start[2] = { 0, 0 };
VX_CALL(vxCopyTensorPatch(a_tensor, 2, view_start, a_dims, a_strides, a_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
VX_CALL(vxCopyTensorPatch(b_tensor, 2, view_start, b_dims, b_strides, b_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
if (arg_->c_present)
{
VX_CALL(vxCopyTensorPatch(c_tensor, 2, view_start, c_dims, c_strides, c_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Create, run and dispose of the graph
{
vx_graph graph = vxCreateGraph(context_->vx_context_);
ASSERT_VX_OBJECT(graph, VX_TYPE_GRAPH);
vx_tensor_matrix_multiply_params_t params = { arg_->a_transposed, arg_->b_transposed, arg_->c_transposed };
vx_node node = vxTensorMatrixMultiplyNode(graph, a_tensor, b_tensor, c_tensor, &params, out_tensor);
ASSERT_VX_OBJECT(node, VX_TYPE_NODE);
VX_CALL(vxReleaseNode(&node));
EXPECT_EQ_PTR(NULL, node);
VX_CALL(vxVerifyGraph(graph));
VX_CALL(vxProcessGraph(graph));
VX_CALL(vxReleaseGraph(&graph));
EXPECT_EQ_PTR(NULL, graph);
}
{
ownTensorMatrixMultiply(
arg_->fmt,
a_data, a_dims, a_strides, arg_->a_transposed,
b_data, b_dims, b_strides, arg_->b_transposed,
c_data, c_dims, c_strides, arg_->c_transposed,
ref_data, out_dims, out_strides);
VX_CALL(vxCopyTensorPatch(out_tensor, 2, view_start, out_dims, out_strides, out_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
size_t first_diff_index;
size_t first_diff_byte_offset0;
size_t first_diff_byte_offset1;
if (!ownExpectIdenticalData(
arg_->fmt,
out_data, out_dims, 2, out_strides,
ref_data, out_dims, 2, out_strides,
8, //(arg_->fmt == TT_Q78 ? 1 : 0),
&first_diff_index,
&first_diff_byte_offset0,
&first_diff_byte_offset1))
{
printf("DIFF! { idx: %zu, out: ", first_diff_index);
ownPrettyPrintVal(arg_->fmt, (char*)out_data + first_diff_byte_offset0);
printf(", ref: ");
ownPrettyPrintVal(arg_->fmt, (char*)ref_data + first_diff_byte_offset1);
printf(" }\n");
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT(0);
}
}
VX_CALL(vxReleaseTensor(&a_tensor));
VX_CALL(vxReleaseTensor(&b_tensor));
if (arg_->c_present) VX_CALL(vxReleaseTensor(&c_tensor));
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, a_tensor);
EXPECT_EQ_PTR(NULL, b_tensor);
EXPECT_EQ_PTR(NULL, c_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
ct_free_mem(a_data);
ct_free_mem(b_data);
ct_free_mem(c_data);
ct_free_mem(out_data);
ct_free_mem(ref_data);
}
}
TEST_WITH_ARG(TensorOp, testvxuTensorMatrixMultiply, test_tensor_matrix_multiply_op_arg,
TT_TENSOR_MAD_ALL()
)
{
const vx_context context = context_->vx_context_;
vx_size max_dims = 0;
{ // TODO: ownTestGetMaxDims() ?
VX_CALL(vxQueryContext(context_->vx_context_, VX_CONTEXT_MAX_TENSOR_DIMS, &max_dims, sizeof(max_dims)));
ASSERT(max_dims > 3);
if(!DEBUG_TEST_TENSOR_BEYOND_FOUR_DIMS) max_dims = 4; else max_dims = MIN(max_dims, MAX_TENSOR_DIMS);
}
uint64_t rng;
{ // TODO: ownTestGetRNG() ?
uint64_t * seed = &CT()->seed_;
ASSERT(!!seed);
CT_RNG_INIT(rng, *seed);
}
vx_enum data_type = 0;
vx_uint8 fixed_point_position = 0;
vx_size sizeof_data_type = 0;
ownUnpackFormat(arg_->fmt, &data_type, &fixed_point_position, &sizeof_data_type);
for (int iter = 0; iter < TEST_TENSOR_NUM_ITERATIONS; ++iter)
{
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("iter #: %d\n", iter); fflush(stdout);
}
const vx_size m = (vx_size)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
const vx_size n = (vx_size)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
const vx_size k = (vx_size)CT_RNG_NEXT_INT(rng, TEST_TENSOR_MIN_DIM_SZ, TEST_TENSOR_MAX_DIM_SZ+1);
// Note that unlike common GEMM, here we do not update an existing c but
// output to a different tensor!
const vx_size a_dims[2] = { arg_->a_transposed ? m : n, arg_->a_transposed ? n : m };
const vx_size b_dims[2] = { arg_->b_transposed ? n : k, arg_->b_transposed ? k : n };
const vx_size c_dims[2] = { arg_->c_transposed ? m : k, arg_->c_transposed ? k : m };
const vx_size out_dims[2] = { k, m };
if (DEBUG_TEST_TENSOR_ENABLE_PRINTF)
{
printf("\tconfig: {\n");
printf("\t a_dims: { %zu, %zu },\n", a_dims[0], a_dims[1]);
printf("\t b_dims: { %zu, %zu },\n", b_dims[0], b_dims[1]);
if (arg_->c_present)
printf("\t c_dims: { %zu, %zu },\n", c_dims[0], c_dims[1]);
printf("\t out_dims: { %zu, %zu },\n", out_dims[0], out_dims[1]);
printf("\t }\n");
}
const vx_size a_strides[2] = { sizeof_data_type, sizeof_data_type * a_dims[0] };
const vx_size b_strides[2] = { sizeof_data_type, sizeof_data_type * b_dims[0] };
const vx_size c_strides[2] = { sizeof_data_type, sizeof_data_type * c_dims[0] };
const vx_size out_strides[2] = { sizeof_data_type, sizeof_data_type * out_dims[0] };
void * a_data = ct_alloc_mem(m * n * sizeof_data_type);
void * b_data = ct_alloc_mem(n * k * sizeof_data_type);
void * c_data = arg_->c_present ? ct_alloc_mem(m * k * sizeof_data_type) : NULL;
void * out_data = ct_alloc_mem(m * k * sizeof_data_type);
void * ref_data = ct_alloc_mem(m * k * sizeof_data_type);
ASSERT(a_data && b_data && (!arg_->c_present || c_data) && out_data && ref_data);
// Since we check the sum of products here, and the accumulator is only
// supposed to be 32 bits, we need smaller values so that the intermediate
// results don't exceed it.
ownFillSmallRandData(arg_->fmt, &rng, m * n, a_dims[0] + 1, a_data);
ownFillSmallRandData(arg_->fmt, &rng, n * k, a_dims[0] + 1, b_data);
if (arg_->c_present) { ownFillSmallRandData(arg_->fmt, &rng, m * k, a_dims[0] + 1, c_data); }
vx_tensor a_tensor = vxCreateTensor(context_->vx_context_, 2, a_dims, data_type, fixed_point_position);
vx_tensor b_tensor = vxCreateTensor(context_->vx_context_, 2, b_dims, data_type, fixed_point_position);
vx_tensor c_tensor = arg_->c_present ? vxCreateTensor(context_->vx_context_, 2, c_dims, data_type, fixed_point_position) : NULL;
vx_tensor out_tensor = vxCreateTensor(context_->vx_context_, 2, out_dims, data_type, fixed_point_position);
ASSERT_VX_OBJECT(a_tensor, VX_TYPE_TENSOR);
ASSERT_VX_OBJECT(b_tensor, VX_TYPE_TENSOR);
if (arg_->c_present)
{
ASSERT_VX_OBJECT(c_tensor, VX_TYPE_TENSOR);
}
ASSERT_VX_OBJECT(out_tensor, VX_TYPE_TENSOR);
vx_size view_start[2] = { 0, 0 };
VX_CALL(vxCopyTensorPatch(a_tensor, 2, view_start, a_dims, a_strides, a_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
VX_CALL(vxCopyTensorPatch(b_tensor, 2, view_start, b_dims, b_strides, b_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
if (arg_->c_present)
{
VX_CALL(vxCopyTensorPatch(c_tensor, 2, view_start, c_dims, c_strides, c_data, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
}
// Set up the params and run the immediate mode vxuTensorMatrixMultiply call
{
vx_tensor_matrix_multiply_params_t params = { arg_->a_transposed, arg_->b_transposed, arg_->c_transposed };
VX_CALL(vxuTensorMatrixMultiply(context, a_tensor, b_tensor, c_tensor, &params, out_tensor));
}
{
ownTensorMatrixMultiply(
arg_->fmt,
a_data, a_dims, a_strides, arg_->a_transposed,
b_data, b_dims, b_strides, arg_->b_transposed,
c_data, c_dims, c_strides, arg_->c_transposed,
ref_data, out_dims, out_strides);
VX_CALL(vxCopyTensorPatch(out_tensor, 2, view_start, out_dims, out_strides, out_data, VX_READ_ONLY, VX_MEMORY_TYPE_HOST));
size_t first_diff_index;
size_t first_diff_byte_offset0;
size_t first_diff_byte_offset1;
if (!ownExpectIdenticalData(
arg_->fmt,
out_data, out_dims, 2, out_strides,
ref_data, out_dims, 2, out_strides,
8, //(arg_->fmt == TT_Q78 ? 1 : 0),
&first_diff_index,
&first_diff_byte_offset0,
&first_diff_byte_offset1))
{
printf("DIFF! { idx: %zu, out: ", first_diff_index);
ownPrettyPrintVal(arg_->fmt, (char*)out_data + first_diff_byte_offset0);
printf(", ref: ");
ownPrettyPrintVal(arg_->fmt, (char*)ref_data + first_diff_byte_offset1);
printf(" }\n");
if (!DEBUG_TEST_TENSOR_CONTINUE_AFTER_ERROR) ASSERT(0);
}
}
VX_CALL(vxReleaseTensor(&a_tensor));
VX_CALL(vxReleaseTensor(&b_tensor));
if (arg_->c_present) VX_CALL(vxReleaseTensor(&c_tensor));
VX_CALL(vxReleaseTensor(&out_tensor));
EXPECT_EQ_PTR(NULL, a_tensor);
EXPECT_EQ_PTR(NULL, b_tensor);
EXPECT_EQ_PTR(NULL, c_tensor);
EXPECT_EQ_PTR(NULL, out_tensor);
ct_free_mem(a_data);
ct_free_mem(b_data);
ct_free_mem(c_data);
ct_free_mem(out_data);
ct_free_mem(ref_data);
}
}
TESTCASE_TESTS(TensorOp,
/* vx_nodes.h function tests */
testvxTensorElementwiseOp,
testvxuTensorElementwiseOp,
testvxTensorTableLookup,
testvxuTensorTableLookup,
testvxTensorTranspose,
testvxuTensorTranspose,
testvxTensorConvertDepth,
testvxuTensorConvertDepth,
testvxTensorMatrixMultiply,
testvxuTensorMatrixMultiply
/* minigraph tests */
/*, testTensorOpSanity*/
)
#endif //OPENVX_USE_ENHANCED_VISION