test_conformance/Networks/src/precisionConverter.c - third_party/github.com/KhronosGroup/OpenVX-cts - Git at Google

 #include "precisionConverter.h"
 #include <limits.h>
 #include <memory.h>


 // F32: exp_bias:127 SEEEEEEE EMMMMMMM MMMMMMMM MMMMMMMM.
 // F16: exp_bias:15  SEEEEEMM MMMMMMMM
 #define EXP_MASK_F32 0x7F800000U
 #define EXP_MASK_F16     0x7C00U

 // Small helper that reinterprets the bit pattern of a uint32_t as a float32.
 // The bytes are copied with memcpy rather than read through a casted pointer,
 // so the type pun does not break the strict-aliasing rule.
 float asfloat(uint32_t v)
 {
     float f;
     memcpy(&f, &v, sizeof f);
     return f;
 }


 /** @brief Converts FP32 to FP16 with rounding to nearest value to minimize error
 *    the denormal values are converted to 0.
 *  @param x - value in FP32 format
 *  @return value in FP16 format
 **************************************************************/
 uint16_t FP32ToFP16(float x)
 {
     //create minimal positive normal f16 value in f32 format
     //exp:-14,mantissa:0 -> 2^-14 * 1.0
     static uint32_t min16_i = (127 - 14) << 23;
     float min16 = asfloat(min16_i);

     //create maximal positive normal f16 value in f32 and f16 formats
     //exp:15,mantissa:11111 -> 2^15 * 1.(11111)
     static uint32_t max16_i = ((127 + 15) << 23) | 0x007FE000;
     float    max16 = asfloat(max16_i);
     static uint32_t max16f16 = ((15 + 15) << 10) | 0x3FF;

     // define and declare variable for intermidiate and output result
     // the union is used to simplify representation changing
     union
     {
         float f;
         uint32_t u;
     } v;
     v.f = x;

     // get sign in 16bit format
     uint32_t    s = (v.u >> 16) & 0x8000; // sign 16:  00000000 00000000 10000000 00000000

                                           // make it abs
     v.u &= 0x7FFFFFFF;                    // abs mask: 01111111 11111111 11111111 11111111

                                           // check NAN and INF
     if ((v.u & EXP_MASK_F32) == EXP_MASK_F32)
     {
         if (v.u & 0x007FFFFF)
             return s | (v.u >> (23 - 10)) | 0x0200; // return NAN f16
         else
             return s | (v.u >> (23 - 10)); // return INF f16
     }

     // to make f32 round to nearest f16
     // create halfULP for f16 and add it to origin value
     float halfULP = asfloat(v.u & EXP_MASK_F32) * asfloat((127 - 11) << 23);
     v.f += halfULP;

     // if input value is not fit normalized f16 then return 0
     // denormals are not covered by this code and just converted to 0
     if (v.f < min16*0.5F)
         return s;

     // if input value between min16/2 and min16 then return min16
     if (v.f < min16)
         return s | (1 << 10);

     // if input value more than maximal allowed value for f16
     // then return this maximal value
     if (v.f >= max16)
         return max16f16 | s;

     // change exp bias from 127 to 15
     v.u -= ((127 - 15) << 23);

     // round to f16
     v.u >>= (23 - 10);

     return v.u | s;
 }

 /** @brief Converts FP16 to FP32.
 *    INF and NAN are preserved (a NAN additionally gets mantissa bit 9 set),
 *    zero and FP16 denormals both become a signed zero.
 *  @param x - value in FP16 format
 *  @return value in FP32 format
 **************************************************************/
 float FP16ToFP32(uint16_t x)
 {
     // sign moved to its f32 position
     const uint32_t sign = ((uint32_t)x & 0x8000) << 16;
     // exponent field still in its f16 position
     const uint32_t expField = x & EXP_MASK_F16;
     uint32_t bits;

     if (expField == EXP_MASK_F16)
     {
         // INF or NAN: keep the mantissa only
         uint32_t payload = x & 0x03FF;

         // a non-zero mantissa means NAN; raise bit 9 to be aligned with the intrinsic
         if (payload != 0)
             payload |= 0x0200;

         bits = (payload << (23 - 10)) | EXP_MASK_F32 | sign;
     }
     else if (expField == 0)
     {
         // zero and denormals are both converted to zero
         bits = sign;
     }
     else
     {
         // normal number: drop the sign and move exponent and mantissa to the f32 position
         uint32_t magnitude = ((uint32_t)x & 0x7FFF) << (23 - 10);

         // re-bias the exponent (f16 bias is 15 and f32 bias is 127)
         magnitude += ((127 - 15) << 23);

         bits = magnitude | sign;
     }

     // finally represent the result as float and return it
     return asfloat(bits);
 }

 /** @brief Converts S16 (signed int16) to a float
 *  @param s16Pixel - A pointer to a value in S16 format. It does not have to be
 *                    aligned for int16_t; the bytes are copied out with memcpy.
 *  @return float value
 ***************************************************************/
 float S16ToFloat(const char* s16Pixel)
 {
     int16_t value;
     // memcpy avoids a misaligned / aliasing-unsafe read through a casted pointer
     memcpy(&value, s16Pixel, sizeof value);
     return (float)value;
 }

 /** @brief Converts Q78 to a float (the stored int16 divided by 256)
 *  @param q78Pixel - A pointer to a value in Q78 format. It does not have to be
 *                    aligned for int16_t; the bytes are copied out with memcpy.
 *  @return float value
 **************************************************************/
 float Q78ToFloat(const char* q78Pixel)
 {
     int16_t value;
     // memcpy avoids a misaligned / aliasing-unsafe read through a casted pointer
     memcpy(&value, q78Pixel, sizeof value);
     // 8 fractional bits -> scale by 2^-8; exact for every int16 value
     return (float)value / 256.0f;
 }

 /** @brief Converts FP16 to a float
 *  @param fp16Pixel - A pointer to a value in FP16 format. It does not have to be
 *                     aligned for uint16_t; the bytes are copied out with memcpy.
 *  @return float value
 **************************************************************/
 float FP16ToFloat(const char* fp16Pixel)
 {
     uint16_t value;
     // memcpy avoids a misaligned / aliasing-unsafe read through a casted pointer
     memcpy(&value, fp16Pixel, sizeof value);
     return FP16ToFP32(value);
 }

 /** @brief Converts FP32 to a float. It just copies the value from the input pointer.
 *  @param fp32Pixel - A pointer to a value in FP32 format. It does not have to be
 *                     aligned for float; the bytes are copied out with memcpy.
 *  @return float value
 **************************************************************/
 float FP32ToFloat(const char* fp32Pixel)
 {
     float value;
     // memcpy avoids a misaligned / aliasing-unsafe read through a casted pointer
     memcpy(&value, fp32Pixel, sizeof value);
     return value;
 }

 /** @brief Converts float to S16 format and copy it to the input pointer.
 *    The fractional part is truncated toward zero. Values outside the int16
 *    range saturate to SHRT_MAX / SHRT_MIN and NAN is stored as 0, because
 *    converting such a float straight to an integer type is undefined.
 *  @param floatValue - value to convert
 *  @param s16Pixel - A pointer where to copy the converted value
 *  @return void
 **************************************************************/
 void floatToS16(float floatValue, char* s16Pixel)
 {
     int16_t value;
     if (floatValue != floatValue)               // NAN compares unequal to itself
         value = 0;
     else if (floatValue >= (float)SHRT_MAX)
         value = SHRT_MAX;
     else if (floatValue <= (float)SHRT_MIN)
         value = SHRT_MIN;
     else
         value = (int16_t)floatValue;            // in range: plain truncation
     memcpy(s16Pixel, &value, sizeof(int16_t));
 }

 /** @brief Converts float to Q78 format and copy it to the input pointer.
 *    The value is scaled by 256 and rounded to the nearest integer (halves
 *    away from zero). Results outside the int16 range saturate to
 *    SHRT_MAX / SHRT_MIN and NAN is stored as 0.
 *  @param floatValue - value to convert
 *  @param q78Pixel - A pointer where to copy the converted value
 *  @return void
 **************************************************************/
 void floatToQ78(float floatValue, char* q78Pixel)
 {
     double r = floatValue < 0.0f ? -0.5 : 0.5;
     double scaled = (double)floatValue * 256.0 + r;
     int16_t value;

     // Clamp while still in floating point: converting an out-of-range or NAN
     // value to an integer type first would be undefined behaviour.
     if (scaled != scaled)                       // NAN compares unequal to itself
         value = 0;
     else if (scaled >= (double)SHRT_MAX)
         value = SHRT_MAX;
     else if (scaled <= (double)SHRT_MIN)
         value = SHRT_MIN;
     else
         value = (int16_t)scaled;                // in range: truncation finishes the rounding
     memcpy(q78Pixel, &value, sizeof(int16_t));
 }

 /** @brief Converts float to FP16 format (via FP32ToFP16) and copies the
 *    16-bit result to the input pointer
 *  @param floatValue - value to convert
 *  @param fp16Pixel - A pointer where to copy the converted value
 *  @return void
 **************************************************************/
 void floatToFP16(float floatValue, char* fp16Pixel)
 {
     const uint16_t half = FP32ToFP16(floatValue);
     memcpy(fp16Pixel, &half, sizeof half);
 }

 /** @brief Copies the float value, unchanged, to the input pointer
 *  @param floatValue - value to store
 *  @param fp32Pixel - A pointer where to copy the float value
 *  @return void
 **************************************************************/
 void floatToFP32(float floatValue, char* fp32Pixel)
 {
     const float stored = floatValue;
     memcpy(fp32Pixel, &stored, sizeof stored);
 }
	#include "precisionConverter.h"
	#include <limits.h>
	#include <memory.h>


	// F32: exp_bias:127 SEEEEEEE EMMMMMMM MMMMMMMM MMMMMMMM.
	// F16: exp_bias:15 SEEEEEMM MMMMMMMM
	#define EXP_MASK_F32 0x7F800000U
	#define EXP_MASK_F16 0x7C00U

	// Small helper that reinterprets the bit pattern of a uint32_t as a float32.
	// The bytes are copied with memcpy rather than read through a casted pointer,
	// so the type pun does not break the strict-aliasing rule.
	float asfloat(uint32_t v)
	{
	    float f;
	    memcpy(&f, &v, sizeof f);
	    return f;
	}


	/** @brief Converts FP32 to FP16 with rounding to nearest value to minimize error
	* the denormal values are converted to 0.
	* @param x - value in FP32 format
	* @return value in FP16 format
	**************************************************************/
	uint16_t FP32ToFP16(float x)
	{
	//create minimal positive normal f16 value in f32 format
	//exp:-14,mantissa:0 -> 2^-14 * 1.0
	static uint32_t min16_i = (127 - 14) << 23;
	float min16 = asfloat(min16_i);

	//create maximal positive normal f16 value in f32 and f16 formats
	//exp:15,mantissa:11111 -> 2^15 * 1.(11111)
	static uint32_t max16_i = ((127 + 15) << 23) \| 0x007FE000;
	float max16 = asfloat(max16_i);
	static uint32_t max16f16 = ((15 + 15) << 10) \| 0x3FF;

	// define and declare variable for intermidiate and output result
	// the union is used to simplify representation changing
	union
	{
	float f;
	uint32_t u;
	} v;
	v.f = x;

	// get sign in 16bit format
	uint32_t s = (v.u >> 16) & 0x8000; // sign 16: 00000000 00000000 10000000 00000000

	// make it abs
	v.u &= 0x7FFFFFFF; // abs mask: 01111111 11111111 11111111 11111111

	// check NAN and INF
	if ((v.u & EXP_MASK_F32) == EXP_MASK_F32)
	{
	if (v.u & 0x007FFFFF)
	return s \| (v.u >> (23 - 10)) \| 0x0200; // return NAN f16
	else
	return s \| (v.u >> (23 - 10)); // return INF f16
	}

	// to make f32 round to nearest f16
	// create halfULP for f16 and add it to origin value
	float halfULP = asfloat(v.u & EXP_MASK_F32) * asfloat((127 - 11) << 23);
	v.f += halfULP;

	// if input value is not fit normalized f16 then return 0
	// denormals are not covered by this code and just converted to 0
	if (v.f < min16*0.5F)
	return s;

	// if input value between min16/2 and min16 then return min16
	if (v.f < min16)
	return s \| (1 << 10);

	// if input value more than maximal allowed value for f16
	// then return this maximal value
	if (v.f >= max16)
	return max16f16 \| s;

	// change exp bias from 127 to 15
	v.u -= ((127 - 15) << 23);

	// round to f16
	v.u >>= (23 - 10);

	return v.u \| s;
	}

	/** @brief Converts FP16 to FP32.
	*    INF and NAN are preserved (a NAN additionally gets mantissa bit 9 set),
	*    zero and FP16 denormals both become a signed zero.
	*  @param x - value in FP16 format
	*  @return value in FP32 format
	**************************************************************/
	float FP16ToFP32(uint16_t x)
	{
	    // this is storage for output result
	    uint32_t u = x;

	    // get sign in 32bit format
	    uint32_t s = ((u & 0x8000) << 16);

	    // check for NAN and INF
	    if ((u & EXP_MASK_F16) == EXP_MASK_F16)
	    {
	        //keep mantissa only
	        u &= 0x03FF;

	        // a non-zero mantissa means NAN; raise bit 9 to be aligned with the intrinsic
	        if (u)
	            u |= 0x0200;

	        u <<= (23 - 10);
	        u |= EXP_MASK_F32;
	        u |= s;
	    }
	    // check for zero and denormals. both are converted to zero
	    else if ((x & EXP_MASK_F16) == 0)
	    {
	        u = s;
	    }
	    else
	    {
	        //abs
	        u = (u & 0x7FFF);

	        // shift mantissa and exp from f16 to f32 position
	        u <<= (23 - 10);

	        //new bias for exp (f16 bias is 15 and f32 bias is 127)
	        u += ((127 - 15) << 23);

	        //add sign
	        u |= s;
	    }

	    //finally represent result as float and return
	    return asfloat(u);
	}

	/** @brief Converts S16 (signed int16) to a float
	*  @param s16Pixel - A pointer to a value in S16 format. It does not have to be
	*                    aligned for int16_t; the bytes are copied out with memcpy.
	*  @return float value
	***************************************************************/
	float S16ToFloat(const char* s16Pixel)
	{
	    int16_t value;
	    // read the pointed-to bytes (not the pointer itself); memcpy keeps the
	    // read safe for unaligned buffers
	    memcpy(&value, s16Pixel, sizeof value);
	    return (float)value;
	}

	/** @brief Converts Q78 to a float (the stored int16 divided by 256)
	*  @param q78Pixel - A pointer to a value in Q78 format. It does not have to be
	*                    aligned for int16_t; the bytes are copied out with memcpy.
	*  @return float value
	**************************************************************/
	float Q78ToFloat(const char* q78Pixel)
	{
	    int16_t value;
	    // read the pointed-to bytes (not the pointer itself); memcpy keeps the
	    // read safe for unaligned buffers
	    memcpy(&value, q78Pixel, sizeof value);
	    // 8 fractional bits -> scale by 2^-8; exact for every int16 value
	    return (float)value / 256.0f;
	}

	/** @brief Converts FP16 to a float
	*  @param fp16Pixel - A pointer to a value in FP16 format. It does not have to be
	*                     aligned for uint16_t; the bytes are copied out with memcpy.
	*  @return float value
	**************************************************************/
	float FP16ToFloat(const char* fp16Pixel)
	{
	    uint16_t value;
	    // read the pointed-to bytes (not the pointer itself); memcpy keeps the
	    // read safe for unaligned buffers
	    memcpy(&value, fp16Pixel, sizeof value);
	    return FP16ToFP32(value);
	}

	/** @brief Converts FP32 to a float. It just copies the value from the input pointer.
	*  @param fp32Pixel - A pointer to a value in FP32 format. It does not have to be
	*                     aligned for float; the bytes are copied out with memcpy.
	*  @return float value
	**************************************************************/
	float FP32ToFloat(const char* fp32Pixel)
	{
	    float value;
	    // read the pointed-to bytes (not the pointer itself); memcpy keeps the
	    // read safe for unaligned buffers
	    memcpy(&value, fp32Pixel, sizeof value);
	    return value;
	}

	/** @brief Converts float to S16 format and copy it to the input pointer.
	*    The fractional part is truncated toward zero. Values outside the int16
	*    range saturate to SHRT_MAX / SHRT_MIN and NAN is stored as 0, because
	*    converting such a float straight to an integer type is undefined.
	*  @param floatValue - value to convert
	*  @param s16Pixel - A pointer where to copy the converted value
	*  @return void
	**************************************************************/
	void floatToS16(float floatValue, char* s16Pixel)
	{
	    int16_t value;
	    if (floatValue != floatValue)               // NAN compares unequal to itself
	        value = 0;
	    else if (floatValue >= (float)SHRT_MAX)
	        value = SHRT_MAX;
	    else if (floatValue <= (float)SHRT_MIN)
	        value = SHRT_MIN;
	    else
	        value = (int16_t)floatValue;            // in range: plain truncation
	    memcpy(s16Pixel, &value, sizeof(int16_t));
	}

	/** @brief Converts float to Q78 format and copy it to the input pointer.
	*    The value is scaled by 256 and rounded to the nearest integer (halves
	*    away from zero). Results outside the int16 range saturate to
	*    SHRT_MAX / SHRT_MIN and NAN is stored as 0.
	*  @param floatValue - value to convert
	*  @param q78Pixel - A pointer where to copy the converted value
	*  @return void
	**************************************************************/
	void floatToQ78(float floatValue, char* q78Pixel)
	{
	    double r = floatValue < 0.0f ? -0.5 : 0.5;
	    double scaled = (double)floatValue * 256.0 + r;
	    int16_t value;

	    // Clamp while still in floating point: converting an out-of-range or NAN
	    // value to an integer type first would be undefined behaviour.
	    if (scaled != scaled)                       // NAN compares unequal to itself
	        value = 0;
	    else if (scaled >= (double)SHRT_MAX)
	        value = SHRT_MAX;
	    else if (scaled <= (double)SHRT_MIN)
	        value = SHRT_MIN;
	    else
	        value = (int16_t)scaled;                // in range: truncation finishes the rounding
	    memcpy(q78Pixel, &value, sizeof(int16_t));
	}

	/** @brief Converts float to FP16 format (via FP32ToFP16) and copies the
	*    16-bit result to the input pointer
	*  @param floatValue - value to convert
	*  @param fp16Pixel - A pointer where to copy the converted value
	*  @return void
	**************************************************************/
	void floatToFP16(float floatValue, char* fp16Pixel)
	{
	    const uint16_t half = FP32ToFP16(floatValue);
	    memcpy(fp16Pixel, &half, sizeof half);
	}

	/** @brief Copies the float value, unchanged, to the input pointer
	*  @param floatValue - value to store
	*  @param fp32Pixel - A pointer where to copy the float value
	*  @return void
	**************************************************************/
	void floatToFP32(float floatValue, char* fp32Pixel)
	{
	    const float stored = floatValue;
	    memcpy(fp32Pixel, &stored, sizeof stored);
	}