/* embdrv/sbc/decoder/srce/synthesis-dct8.c - third_party/android/platform/system/bt - Git at Google */

 /******************************************************************************
  *
  *  Copyright 2014 The Android Open Source Project
  *  Copyright 2003 - 2004 Open Interface North America, Inc. All rights
  *                        reserved.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at:
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  *  Unless required by applicable law or agreed to in writing, software
  *  distributed under the License is distributed on an "AS IS" BASIS,
  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  *
  ******************************************************************************/

 /*******************************************************************************
   $Revision: #1 $
  ******************************************************************************/

 /** @file
 @ingroup codec_internal
 */

 /**@addtogroup codec_internal*/
 /**@{*/

 /*
  * Performs an 8-point Type-II scaled DCT using the Arai-Agui-Nakajima
  * factorization. The scaling factors are folded into the windowing
  * constants. 29 adds and 5 16x32 multiplies per 8 samples.
  */

 #include "oi_codec_sbc_private.h"

 /*
  * AAN rotation constants in S1.30 fixed point (real value = integer / 2^30).
  * They are consumed by FIX_MULT_DCT() in dct2_8(), which keeps the high 32
  * bits of the 64-bit product and scales that by 4, i.e. divides by 2^30.
  */
 #define AAN_C4_FIX (759250125) /* S1.30  759250125   0.707107 ~ cos(4*pi/16) */

 #define AAN_C6_FIX (410903207) /* S1.30  410903207   0.382683 ~ cos(6*pi/16) */

 #define AAN_Q0_FIX (581104888) /* S1.30  581104888   0.541196 ~ cos(2*pi/16) - cos(6*pi/16) */

 #define AAN_Q1_FIX (1402911301) /* S1.30 1402911301   1.306563 ~ cos(2*pi/16) + cos(6*pi/16) */

 /** Scales x by y bits to the right, adding a rounding factor.
  *
  * Half of the divisor, 1 << (y-1), is added before the shift so the result is
  * rounded instead of truncated. y must be at least 1: with y == 0 the rounding
  * term would shift by a negative count. y is expanded twice, so it should be
  * free of side effects. The #ifndef guard lets a definition of SCALE made
  * earlier (e.g. by an included header) take precedence over this one.
  */
 #ifndef SCALE
 #define SCALE(x, y) (((x) + (1 << ((y)-1))) >> (y))
 #endif

 /**
  * Portable C fallback for the high half of a signed 32x32 multiply. A
  * platform-specific version may be substituted for speed.
  *
  * Each operand is split into an unsigned low 16-bit half and a signed high
  * 16-bit half, and the partial products are combined using 32-bit variables
  * only. NOTE(review): the result relies on >> of a negative int32_t being an
  * arithmetic shift, which is implementation-defined in C.
  *
  * @param u A signed 32-bit multiplicand
  * @param v A signed 32-bit multiplier
  *
  * @return  The 32 most significant bits of the 64-bit product of u and v, as a
  * signed 32-bit value.
  */
 INLINE int32_t default_mul_32s_32s_hi(int32_t u, int32_t v) {
   /* Unsigned low halves, signed high halves. */
   const uint32_t u_lo = u & 0xFFFF;
   const uint32_t v_lo = v & 0xFFFF;
   const int32_t u_hi = u >> 16;
   const int32_t v_hi = v >> 16;

   /* low x low: only its upper 16 bits can carry into the result. */
   const uint32_t lo_lo = u_lo * v_lo;

   /* high(u) x low(v) plus that carry, then split again into halves. */
   const int32_t mid = u_hi * v_lo + (lo_lo >> 16);
   const int32_t mid_lo = mid & 0xFFFF;
   const int32_t mid_hi = mid >> 16;

   /* low(u) x high(v) plus the low half of the previous partial sum. */
   const int32_t cross = u_lo * v_hi + mid_lo;

   return u_hi * v_hi + mid_hi + (cross >> 16);
 }

 #define MUL_32S_32S_HI(_x, _y) default_mul_32s_32s_hi(_x, _y)

 #ifdef DEBUG_DCT
 /**
  * Debug-only (DEBUG_DCT) double-precision mirror of dct2_8().
  *
  * Performs the same sequence of additions, butterflies and constant
  * multiplies as the fixed-point routine and checks every intermediate with
  * OI_ASSERT(VALID_INT32(...)) and every output with
  * OI_ASSERT(VALID_INT16(...)) (presumably range checks; both macros are
  * defined outside this file).
  *
  * NOTE(review): AAN_C4_FLOAT, AAN_C6_FLOAT, AAN_Q0_FLOAT and AAN_Q1_FLOAT are
  * not defined in this file, and floor() needs <math.h>; confirm both are
  * provided by the build before enabling DEBUG_DCT.
  *
  * @param out Receives the eight coefficients as floats.
  * @param in  Eight int32_t input samples; read only.
  */
 PRIVATE void float_dct2_8(float* RESTRICT out, int32_t const* RESTRICT in) {
 /* Rounds x to the nearest multiple of 2^-bits. */
 #define FIX(x, bits) \
   (((int)floor(0.5f + ((x) * ((float)(1 << bits))))) / ((float)(1 << bits)))
 /* In-place butterfly: x becomes x + y, y becomes (old x) - (old y); both
  * results are then asserted. Wrapped in do/while so it acts as one statement. */
 #define FLOAT_BUTTERFLY(x, y)    \
   do {                           \
     (x) += (y);                  \
     (y) = (x) - ((y) * 2);       \
     OI_ASSERT(VALID_INT32(x));   \
     OI_ASSERT(VALID_INT32(y));   \
   } while (0)
 /* Multiplies sample by K after quantizing K to 20 fractional bits. */
 #define FLOAT_MULT_DCT(K, sample) (FIX(K, 20) * sample)
 /* Divides x by 2^y (the floating-point counterpart of SCALE). */
 #define FLOAT_SCALE(x, y) (((x) / (double)(1 << (y))))

   double L00, L01, L02, L03, L04, L05, L06, L07;
   double L25;

   double in0, in1, in2, in3;
   double in4, in5, in6, in7;

   in0 = FLOAT_SCALE(in[0], DCTII_8_SHIFT_IN);
   OI_ASSERT(VALID_INT32(in0));
   in1 = FLOAT_SCALE(in[1], DCTII_8_SHIFT_IN);
   OI_ASSERT(VALID_INT32(in1));
   in2 = FLOAT_SCALE(in[2], DCTII_8_SHIFT_IN);
   OI_ASSERT(VALID_INT32(in2));
   in3 = FLOAT_SCALE(in[3], DCTII_8_SHIFT_IN);
   OI_ASSERT(VALID_INT32(in3));
   in4 = FLOAT_SCALE(in[4], DCTII_8_SHIFT_IN);
   OI_ASSERT(VALID_INT32(in4));
   in5 = FLOAT_SCALE(in[5], DCTII_8_SHIFT_IN);
   OI_ASSERT(VALID_INT32(in5));
   in6 = FLOAT_SCALE(in[6], DCTII_8_SHIFT_IN);
   OI_ASSERT(VALID_INT32(in6));
   in7 = FLOAT_SCALE(in[7], DCTII_8_SHIFT_IN);
   OI_ASSERT(VALID_INT32(in7));

   /* First stage: sums feed the even outputs, differences the odd outputs. */
   L00 = (in0 + in7);
   OI_ASSERT(VALID_INT32(L00));
   L01 = (in1 + in6);
   OI_ASSERT(VALID_INT32(L01));
   L02 = (in2 + in5);
   OI_ASSERT(VALID_INT32(L02));
   L03 = (in3 + in4);
   OI_ASSERT(VALID_INT32(L03));

   L04 = (in3 - in4);
   OI_ASSERT(VALID_INT32(L04));
   L05 = (in2 - in5);
   OI_ASSERT(VALID_INT32(L05));
   L06 = (in1 - in6);
   OI_ASSERT(VALID_INT32(L06));
   L07 = (in0 - in7);
   OI_ASSERT(VALID_INT32(L07));

   /* Even half: produces out[0], out[4], out[6], out[2]. */
   FLOAT_BUTTERFLY(L00, L03);
   FLOAT_BUTTERFLY(L01, L02);

   L02 += L03;
   OI_ASSERT(VALID_INT32(L02));

   L02 = FLOAT_MULT_DCT(AAN_C4_FLOAT, L02);
   OI_ASSERT(VALID_INT32(L02));

   FLOAT_BUTTERFLY(L00, L01);

   out[0] = (float)FLOAT_SCALE(L00, DCTII_8_SHIFT_0);
   OI_ASSERT(VALID_INT16(out[0]));
   out[4] = (float)FLOAT_SCALE(L01, DCTII_8_SHIFT_4);
   OI_ASSERT(VALID_INT16(out[4]));

   FLOAT_BUTTERFLY(L03, L02);
   out[6] = (float)FLOAT_SCALE(L02, DCTII_8_SHIFT_6);
   OI_ASSERT(VALID_INT16(out[6]));
   out[2] = (float)FLOAT_SCALE(L03, DCTII_8_SHIFT_2);
   OI_ASSERT(VALID_INT16(out[2]));

   /* Odd half: produces out[3], out[5], out[7], out[1]. */
   L04 += L05;
   OI_ASSERT(VALID_INT32(L04));
   L05 += L06;
   OI_ASSERT(VALID_INT32(L05));
   L06 += L07;
   OI_ASSERT(VALID_INT32(L06));

   /* Halved here; the odd outputs below shift by one bit less in return. */
   L04 /= 2;
   L05 /= 2;
   L06 /= 2;
   L07 /= 2;

   L05 = FLOAT_MULT_DCT(AAN_C4_FLOAT, L05);
   OI_ASSERT(VALID_INT32(L05));

   L25 = L06 - L04;
   OI_ASSERT(VALID_INT32(L25));
   L25 = FLOAT_MULT_DCT(AAN_C6_FLOAT, L25);
   OI_ASSERT(VALID_INT32(L25));

   L04 = FLOAT_MULT_DCT(AAN_Q0_FLOAT, L04);
   OI_ASSERT(VALID_INT32(L04));
   L04 -= L25;
   OI_ASSERT(VALID_INT32(L04));

   L06 = FLOAT_MULT_DCT(AAN_Q1_FLOAT, L06);
   OI_ASSERT(VALID_INT32(L06));
   L06 -= L25;
   /* Check the value just updated (previously L25 was re-checked by mistake). */
   OI_ASSERT(VALID_INT32(L06));

   FLOAT_BUTTERFLY(L07, L05);

   FLOAT_BUTTERFLY(L05, L04);
   out[3] = (float)(FLOAT_SCALE(L04, DCTII_8_SHIFT_3 - 1));
   OI_ASSERT(VALID_INT16(out[3]));
   out[5] = (float)(FLOAT_SCALE(L05, DCTII_8_SHIFT_5 - 1));
   OI_ASSERT(VALID_INT16(out[5]));

   FLOAT_BUTTERFLY(L07, L06);
   out[7] = (float)(FLOAT_SCALE(L06, DCTII_8_SHIFT_7 - 1));
   OI_ASSERT(VALID_INT16(out[7]));
   out[1] = (float)(FLOAT_SCALE(L07, DCTII_8_SHIFT_1 - 1));
   OI_ASSERT(VALID_INT16(out[1]));
 }
 /* Drop the helper macros that are local to float_dct2_8(). */
 #undef FIX
 #undef FLOAT_BUTTERFLY
 #undef FLOAT_MULT_DCT
 #undef FLOAT_SCALE
 #endif

 /*
  * This function calculates the AAN DCT. Its inputs are in S16.15 format, as
  * returned by OI_SBC_Dequant. In practice, abs(in[x]) < 52429.0 / 1.38
  * (1244918057 integer). The function it computes is an approximation to the
  * array defined by:
  *
  * diag(aan_s) * AAN= C2
  *
  *   or
  *
  * AAN = diag(1/aan_s) * C2
  *
  * where C2 is as it is defined in the comment at the head of this file, and
  *
  * aan_s[i] = 1/(2*cos(i*pi/16)) for i = 1..7, and aan_s[0] = 1;
  *
  * aan_s[i] = [ 1.000  0.510  0.541  0.601  0.707  0.900  1.307  2.563 ]
  *
  * The output ranges are shown as follows:
  *
  * Let Y[0..7] = AAN * X[0..7]
  *
  * Without loss of generality, assume the input vector X consists of elements
  * between -1 and 1. The maximum possible value of a given output element occurs
  * with some particular combination of input vector elements each of which is -1
  * or 1. Consider the computation of Y[i]. Y[i] = sum t=0..7 of AAN[t,i]*X[t]. Y
  * is maximized if the sign of X[t] matches the sign of AAN[t,i], ensuring a
  * positive contribution to the sum. Equivalently, one may simply sum
  * abs(AAN)[t,i] over t to get the maximum possible value of Y[i].
  *
  * This yields approximately:
  *  [8.00  10.05   9.66   8.52   8.00   5.70   4.00   2.00]
  *
  * Given the maximum magnitude sensible input value of +/-37992, this yields the
  * following vector of maximum output magnitudes:
  *
  * [ 303936  381820  367003  323692  303936  216555  151968   75984 ]
  *
  * Ultimately, these values must fit into 16 bit signed integers, so they must
  * be scaled. A non-uniform scaling helps maximize the kept precision. The
  * relative number of extra bits of precision maintainable with respect to the
  * largest value is given here:
  *
  * [ 0  0  0  0  0  0  1  2 ]
  *
  */
 /**
  * Fixed-point 8-point DCT-II kernel (AAN factorization).
  *
  * @param out Receives the eight coefficients; each one is rounded by SCALE()
  *            with its own DCTII_8_SHIFT_n and narrowed to int16_t.
  * @param in  Eight int32_t input samples; read only.
  */
 PRIVATE void dct2_8(SBC_BUFFER_T* RESTRICT out, int32_t const* RESTRICT in) {
 /*
  * In-place butterfly: x becomes x + y and y becomes (old x) - (old y).
  * The doubling is a multiplication, not a left shift, because left-shifting a
  * negative signed value is undefined behaviour in C. Wrapped in do/while so
  * the macro acts as a single statement.
  */
 #define BUTTERFLY(x, y)      \
   do {                       \
     (x) += (y);              \
     (y) = (x) - ((y) * 2);   \
   } while (0)
 /*
  * Multiplies x by the S1.30 constant K: take the high 32 bits of the 64-bit
  * product, then scale by 4 (a multiplication, for the same reason as above).
  */
 #define FIX_MULT_DCT(K, x) (MUL_32S_32S_HI(K, x) * 4)

   int32_t L00, L01, L02, L03, L04, L05, L06, L07;
   int32_t L25;

   int32_t in0, in1, in2, in3;
   int32_t in4, in5, in6, in7;

   /* Optional input pre-scaling; skipped entirely when the shift is zero. */
 #if DCTII_8_SHIFT_IN != 0
   in0 = SCALE(in[0], DCTII_8_SHIFT_IN);
   in1 = SCALE(in[1], DCTII_8_SHIFT_IN);
   in2 = SCALE(in[2], DCTII_8_SHIFT_IN);
   in3 = SCALE(in[3], DCTII_8_SHIFT_IN);
   in4 = SCALE(in[4], DCTII_8_SHIFT_IN);
   in5 = SCALE(in[5], DCTII_8_SHIFT_IN);
   in6 = SCALE(in[6], DCTII_8_SHIFT_IN);
   in7 = SCALE(in[7], DCTII_8_SHIFT_IN);
 #else
   in0 = in[0];
   in1 = in[1];
   in2 = in[2];
   in3 = in[3];
   in4 = in[4];
   in5 = in[5];
   in6 = in[6];
   in7 = in[7];
 #endif

   /* First stage: sums feed the even outputs, differences the odd outputs. */
   L00 = in0 + in7;
   L01 = in1 + in6;
   L02 = in2 + in5;
   L03 = in3 + in4;

   L04 = in3 - in4;
   L05 = in2 - in5;
   L06 = in1 - in6;
   L07 = in0 - in7;

   /* Even half: produces out[0], out[4], out[6], out[2]. */
   BUTTERFLY(L00, L03);
   BUTTERFLY(L01, L02);

   L02 += L03;

   L02 = FIX_MULT_DCT(AAN_C4_FIX, L02);

   BUTTERFLY(L00, L01);

   out[0] = (int16_t)SCALE(L00, DCTII_8_SHIFT_0);
   out[4] = (int16_t)SCALE(L01, DCTII_8_SHIFT_4);

   BUTTERFLY(L03, L02);
   out[6] = (int16_t)SCALE(L02, DCTII_8_SHIFT_6);
   out[2] = (int16_t)SCALE(L03, DCTII_8_SHIFT_2);

   /* Odd half: produces out[3], out[5], out[7], out[1]. */
   L04 += L05;
   L05 += L06;
   L06 += L07;

   /* Halved here; the odd outputs below shift by one bit less in return. */
   L04 /= 2;
   L05 /= 2;
   L06 /= 2;
   L07 /= 2;

   L05 = FIX_MULT_DCT(AAN_C4_FIX, L05);

   /* L25 is the shared rotation term subtracted from both L04 and L06. */
   L25 = L06 - L04;
   L25 = FIX_MULT_DCT(AAN_C6_FIX, L25);

   L04 = FIX_MULT_DCT(AAN_Q0_FIX, L04);
   L04 -= L25;

   L06 = FIX_MULT_DCT(AAN_Q1_FIX, L06);
   L06 -= L25;

   BUTTERFLY(L07, L05);

   BUTTERFLY(L05, L04);
   out[3] = (int16_t)SCALE(L04, DCTII_8_SHIFT_3 - 1);
   out[5] = (int16_t)SCALE(L05, DCTII_8_SHIFT_5 - 1);

   BUTTERFLY(L07, L06);
   out[7] = (int16_t)SCALE(L06, DCTII_8_SHIFT_7 - 1);
   out[1] = (int16_t)SCALE(L07, DCTII_8_SHIFT_1 - 1);
 #undef BUTTERFLY
 #undef FIX_MULT_DCT

 #ifdef DEBUG_DCT
   {
     /* Run the floating-point mirror for its internal assertions only; the
      * values it returns are not compared against out[]. */
     float float_out[8];
     float_dct2_8(float_out, in);
   }
 #endif
 }

 /**@}*/
	/******************************************************************************
	*
	* Copyright 2014 The Android Open Source Project
	* Copyright 2003 - 2004 Open Interface North America, Inc. All rights
	* reserved.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at:
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	******************************************************************************/

	/*******************************************************************************
	$Revision: #1 $
	******************************************************************************/

	/** @file
	@ingroup codec_internal
	*/

	/**@addtogroup codec_internal*/
	/**@{*/

	/*
	* Performs an 8-point Type-II scaled DCT using the Arai-Agui-Nakajima
	* factorization. The scaling factors are folded into the windowing
	* constants. 29 adds and 5 16x32 multiplies per 8 samples.
	*/

	#include "oi_codec_sbc_private.h"

	/*
	 * AAN rotation constants in S1.30 fixed point (real value = integer / 2^30).
	 * They are consumed by FIX_MULT_DCT() in dct2_8(), which keeps the high 32
	 * bits of the 64-bit product and scales that by 4, i.e. divides by 2^30.
	 */
	#define AAN_C4_FIX (759250125) /* S1.30 759250125 0.707107 ~ cos(4*pi/16) */

	#define AAN_C6_FIX (410903207) /* S1.30 410903207 0.382683 ~ cos(6*pi/16) */

	#define AAN_Q0_FIX (581104888) /* S1.30 581104888 0.541196 ~ cos(2*pi/16) - cos(6*pi/16) */

	#define AAN_Q1_FIX (1402911301) /* S1.30 1402911301 1.306563 ~ cos(2*pi/16) + cos(6*pi/16) */

	/** Scales x by y bits to the right, adding a rounding factor.
	*
	* Half of the divisor, 1 << (y-1), is added before the shift so the result is
	* rounded instead of truncated. y must be at least 1: with y == 0 the rounding
	* term would shift by a negative count. y is expanded twice, so it should be
	* free of side effects. The #ifndef guard lets a definition of SCALE made
	* earlier (e.g. by an included header) take precedence over this one.
	*/
	#ifndef SCALE
	#define SCALE(x, y) (((x) + (1 << ((y)-1))) >> (y))
	#endif

	/**
	* Portable C fallback for the high half of a signed 32x32 multiply. A
	* platform-specific version may be substituted for speed.
	*
	* Each operand is split into an unsigned low 16-bit half and a signed high
	* 16-bit half, and the partial products are combined using 32-bit variables
	* only. NOTE(review): the result relies on >> of a negative int32_t being an
	* arithmetic shift, which is implementation-defined in C.
	*
	* @param u A signed 32-bit multiplicand
	* @param v A signed 32-bit multiplier
	*
	* @return The 32 most significant bits of the 64-bit product of u and v, as a
	* signed 32-bit value.
	*/
	INLINE int32_t default_mul_32s_32s_hi(int32_t u, int32_t v) {
	  /* Unsigned low halves, signed high halves. */
	  const uint32_t u_lo = u & 0xFFFF;
	  const uint32_t v_lo = v & 0xFFFF;
	  const int32_t u_hi = u >> 16;
	  const int32_t v_hi = v >> 16;

	  /* low x low: only its upper 16 bits can carry into the result. */
	  const uint32_t lo_lo = u_lo * v_lo;

	  /* high(u) x low(v) plus that carry, then split again into halves. */
	  const int32_t mid = u_hi * v_lo + (lo_lo >> 16);
	  const int32_t mid_lo = mid & 0xFFFF;
	  const int32_t mid_hi = mid >> 16;

	  /* low(u) x high(v) plus the low half of the previous partial sum. */
	  const int32_t cross = u_lo * v_hi + mid_lo;

	  return u_hi * v_hi + mid_hi + (cross >> 16);
	}

	#define MUL_32S_32S_HI(_x, _y) default_mul_32s_32s_hi(_x, _y)

	#ifdef DEBUG_DCT
	/**
	* Debug-only (DEBUG_DCT) double-precision mirror of dct2_8().
	*
	* Performs the same sequence of additions, butterflies and constant
	* multiplies as the fixed-point routine and checks every intermediate with
	* OI_ASSERT(VALID_INT32(...)) and every output with
	* OI_ASSERT(VALID_INT16(...)) (presumably range checks; both macros are
	* defined outside this file).
	*
	* NOTE(review): AAN_C4_FLOAT, AAN_C6_FLOAT, AAN_Q0_FLOAT and AAN_Q1_FLOAT are
	* not defined in this file, and floor() needs <math.h>; confirm both are
	* provided by the build before enabling DEBUG_DCT.
	*
	* @param out Receives the eight coefficients as floats.
	* @param in  Eight int32_t input samples; read only.
	*/
	PRIVATE void float_dct2_8(float* RESTRICT out, int32_t const* RESTRICT in) {
	/* Rounds x to the nearest multiple of 2^-bits. */
	#define FIX(x, bits) \
	(((int)floor(0.5f + ((x) * ((float)(1 << bits))))) / ((float)(1 << bits)))
	/* In-place butterfly: x becomes x + y, y becomes (old x) - (old y); both
	* results are then asserted. Wrapped in do/while so it acts as one statement. */
	#define FLOAT_BUTTERFLY(x, y) \
	do { \
	(x) += (y); \
	(y) = (x) - ((y) * 2); \
	OI_ASSERT(VALID_INT32(x)); \
	OI_ASSERT(VALID_INT32(y)); \
	} while (0)
	/* Multiplies sample by K after quantizing K to 20 fractional bits. */
	#define FLOAT_MULT_DCT(K, sample) (FIX(K, 20) * sample)
	/* Divides x by 2^y (the floating-point counterpart of SCALE). */
	#define FLOAT_SCALE(x, y) (((x) / (double)(1 << (y))))

	  double L00, L01, L02, L03, L04, L05, L06, L07;
	  double L25;

	  double in0, in1, in2, in3;
	  double in4, in5, in6, in7;

	  in0 = FLOAT_SCALE(in[0], DCTII_8_SHIFT_IN);
	  OI_ASSERT(VALID_INT32(in0));
	  in1 = FLOAT_SCALE(in[1], DCTII_8_SHIFT_IN);
	  OI_ASSERT(VALID_INT32(in1));
	  in2 = FLOAT_SCALE(in[2], DCTII_8_SHIFT_IN);
	  OI_ASSERT(VALID_INT32(in2));
	  in3 = FLOAT_SCALE(in[3], DCTII_8_SHIFT_IN);
	  OI_ASSERT(VALID_INT32(in3));
	  in4 = FLOAT_SCALE(in[4], DCTII_8_SHIFT_IN);
	  OI_ASSERT(VALID_INT32(in4));
	  in5 = FLOAT_SCALE(in[5], DCTII_8_SHIFT_IN);
	  OI_ASSERT(VALID_INT32(in5));
	  in6 = FLOAT_SCALE(in[6], DCTII_8_SHIFT_IN);
	  OI_ASSERT(VALID_INT32(in6));
	  in7 = FLOAT_SCALE(in[7], DCTII_8_SHIFT_IN);
	  OI_ASSERT(VALID_INT32(in7));

	  /* First stage: sums feed the even outputs, differences the odd outputs. */
	  L00 = (in0 + in7);
	  OI_ASSERT(VALID_INT32(L00));
	  L01 = (in1 + in6);
	  OI_ASSERT(VALID_INT32(L01));
	  L02 = (in2 + in5);
	  OI_ASSERT(VALID_INT32(L02));
	  L03 = (in3 + in4);
	  OI_ASSERT(VALID_INT32(L03));

	  L04 = (in3 - in4);
	  OI_ASSERT(VALID_INT32(L04));
	  L05 = (in2 - in5);
	  OI_ASSERT(VALID_INT32(L05));
	  L06 = (in1 - in6);
	  OI_ASSERT(VALID_INT32(L06));
	  L07 = (in0 - in7);
	  OI_ASSERT(VALID_INT32(L07));

	  /* Even half: produces out[0], out[4], out[6], out[2]. */
	  FLOAT_BUTTERFLY(L00, L03);
	  FLOAT_BUTTERFLY(L01, L02);

	  L02 += L03;
	  OI_ASSERT(VALID_INT32(L02));

	  L02 = FLOAT_MULT_DCT(AAN_C4_FLOAT, L02);
	  OI_ASSERT(VALID_INT32(L02));

	  FLOAT_BUTTERFLY(L00, L01);

	  out[0] = (float)FLOAT_SCALE(L00, DCTII_8_SHIFT_0);
	  OI_ASSERT(VALID_INT16(out[0]));
	  out[4] = (float)FLOAT_SCALE(L01, DCTII_8_SHIFT_4);
	  OI_ASSERT(VALID_INT16(out[4]));

	  FLOAT_BUTTERFLY(L03, L02);
	  out[6] = (float)FLOAT_SCALE(L02, DCTII_8_SHIFT_6);
	  OI_ASSERT(VALID_INT16(out[6]));
	  out[2] = (float)FLOAT_SCALE(L03, DCTII_8_SHIFT_2);
	  OI_ASSERT(VALID_INT16(out[2]));

	  /* Odd half: produces out[3], out[5], out[7], out[1]. */
	  L04 += L05;
	  OI_ASSERT(VALID_INT32(L04));
	  L05 += L06;
	  OI_ASSERT(VALID_INT32(L05));
	  L06 += L07;
	  OI_ASSERT(VALID_INT32(L06));

	  /* Halved here; the odd outputs below shift by one bit less in return. */
	  L04 /= 2;
	  L05 /= 2;
	  L06 /= 2;
	  L07 /= 2;

	  L05 = FLOAT_MULT_DCT(AAN_C4_FLOAT, L05);
	  OI_ASSERT(VALID_INT32(L05));

	  L25 = L06 - L04;
	  OI_ASSERT(VALID_INT32(L25));
	  L25 = FLOAT_MULT_DCT(AAN_C6_FLOAT, L25);
	  OI_ASSERT(VALID_INT32(L25));

	  L04 = FLOAT_MULT_DCT(AAN_Q0_FLOAT, L04);
	  OI_ASSERT(VALID_INT32(L04));
	  L04 -= L25;
	  OI_ASSERT(VALID_INT32(L04));

	  L06 = FLOAT_MULT_DCT(AAN_Q1_FLOAT, L06);
	  OI_ASSERT(VALID_INT32(L06));
	  L06 -= L25;
	  /* Check the value just updated (previously L25 was re-checked by mistake). */
	  OI_ASSERT(VALID_INT32(L06));

	  FLOAT_BUTTERFLY(L07, L05);

	  FLOAT_BUTTERFLY(L05, L04);
	  out[3] = (float)(FLOAT_SCALE(L04, DCTII_8_SHIFT_3 - 1));
	  OI_ASSERT(VALID_INT16(out[3]));
	  out[5] = (float)(FLOAT_SCALE(L05, DCTII_8_SHIFT_5 - 1));
	  OI_ASSERT(VALID_INT16(out[5]));

	  FLOAT_BUTTERFLY(L07, L06);
	  out[7] = (float)(FLOAT_SCALE(L06, DCTII_8_SHIFT_7 - 1));
	  OI_ASSERT(VALID_INT16(out[7]));
	  out[1] = (float)(FLOAT_SCALE(L07, DCTII_8_SHIFT_1 - 1));
	  OI_ASSERT(VALID_INT16(out[1]));
	}
	/* Drop the helper macros that are local to float_dct2_8(). */
	#undef FIX
	#undef FLOAT_BUTTERFLY
	#undef FLOAT_MULT_DCT
	#undef FLOAT_SCALE
	#endif

	/*
	* This function calculates the AAN DCT. Its inputs are in S16.15 format, as
	* returned by OI_SBC_Dequant. In practice, abs(in[x]) < 52429.0 / 1.38
	* (1244918057 integer). The function it computes is an approximation to the
	* array defined by:
	*
	* diag(aan_s) * AAN= C2
	*
	* or
	*
	* AAN = diag(1/aan_s) * C2
	*
	* where C2 is as it is defined in the comment at the head of this file, and
	*
	* aan_s[i] = 1/(2*cos(i*pi/16)) for i = 1..7, and aan_s[0] = 1;
	*
	* aan_s[i] = [ 1.000 0.510 0.541 0.601 0.707 0.900 1.307 2.563 ]
	*
	* The output ranges are shown as follows:
	*
	* Let Y[0..7] = AAN * X[0..7]
	*
	* Without loss of generality, assume the input vector X consists of elements
	* between -1 and 1. The maximum possible value of a given output element occurs
	* with some particular combination of input vector elements each of which is -1
	* or 1. Consider the computation of Y[i]. Y[i] = sum t=0..7 of AAN[t,i]*X[t]. Y
	* is maximized if the sign of X[t] matches the sign of AAN[t,i], ensuring a
	* positive contribution to the sum. Equivalently, one may simply sum
	* abs(AAN)[t,i] over t to get the maximum possible value of Y[i].
	*
	* This yields approximately:
	* [8.00 10.05 9.66 8.52 8.00 5.70 4.00 2.00]
	*
	* Given the maximum magnitude sensible input value of +/-37992, this yields the
	* following vector of maximum output magnitudes:
	*
	* [ 303936 381820 367003 323692 303936 216555 151968 75984 ]
	*
	* Ultimately, these values must fit into 16 bit signed integers, so they must
	* be scaled. A non-uniform scaling helps maximize the kept precision. The
	* relative number of extra bits of precision maintainable with respect to the
	* largest value is given here:
	*
	* [ 0 0 0 0 0 0 1 2 ]
	*
	*/
	/**
	* Fixed-point 8-point DCT-II kernel (AAN factorization).
	*
	* @param out Receives the eight coefficients; each one is rounded by SCALE()
	*            with its own DCTII_8_SHIFT_n and narrowed to int16_t.
	* @param in  Eight int32_t input samples; read only.
	*/
	PRIVATE void dct2_8(SBC_BUFFER_T* RESTRICT out, int32_t const* RESTRICT in) {
	/*
	* In-place butterfly: x becomes x + y and y becomes (old x) - (old y).
	* The doubling is a multiplication, not a left shift, because left-shifting a
	* negative signed value is undefined behaviour in C. Wrapped in do/while so
	* the macro acts as a single statement.
	*/
	#define BUTTERFLY(x, y) \
	do { \
	(x) += (y); \
	(y) = (x) - ((y) * 2); \
	} while (0)
	/*
	* Multiplies x by the S1.30 constant K: take the high 32 bits of the 64-bit
	* product, then scale by 4 (a multiplication, for the same reason as above).
	*/
	#define FIX_MULT_DCT(K, x) (MUL_32S_32S_HI(K, x) * 4)

	  int32_t L00, L01, L02, L03, L04, L05, L06, L07;
	  int32_t L25;

	  int32_t in0, in1, in2, in3;
	  int32_t in4, in5, in6, in7;

	  /* Optional input pre-scaling; skipped entirely when the shift is zero. */
	#if DCTII_8_SHIFT_IN != 0
	  in0 = SCALE(in[0], DCTII_8_SHIFT_IN);
	  in1 = SCALE(in[1], DCTII_8_SHIFT_IN);
	  in2 = SCALE(in[2], DCTII_8_SHIFT_IN);
	  in3 = SCALE(in[3], DCTII_8_SHIFT_IN);
	  in4 = SCALE(in[4], DCTII_8_SHIFT_IN);
	  in5 = SCALE(in[5], DCTII_8_SHIFT_IN);
	  in6 = SCALE(in[6], DCTII_8_SHIFT_IN);
	  in7 = SCALE(in[7], DCTII_8_SHIFT_IN);
	#else
	  in0 = in[0];
	  in1 = in[1];
	  in2 = in[2];
	  in3 = in[3];
	  in4 = in[4];
	  in5 = in[5];
	  in6 = in[6];
	  in7 = in[7];
	#endif

	  /* First stage: sums feed the even outputs, differences the odd outputs. */
	  L00 = in0 + in7;
	  L01 = in1 + in6;
	  L02 = in2 + in5;
	  L03 = in3 + in4;

	  L04 = in3 - in4;
	  L05 = in2 - in5;
	  L06 = in1 - in6;
	  L07 = in0 - in7;

	  /* Even half: produces out[0], out[4], out[6], out[2]. */
	  BUTTERFLY(L00, L03);
	  BUTTERFLY(L01, L02);

	  L02 += L03;

	  L02 = FIX_MULT_DCT(AAN_C4_FIX, L02);

	  BUTTERFLY(L00, L01);

	  out[0] = (int16_t)SCALE(L00, DCTII_8_SHIFT_0);
	  out[4] = (int16_t)SCALE(L01, DCTII_8_SHIFT_4);

	  BUTTERFLY(L03, L02);
	  out[6] = (int16_t)SCALE(L02, DCTII_8_SHIFT_6);
	  out[2] = (int16_t)SCALE(L03, DCTII_8_SHIFT_2);

	  /* Odd half: produces out[3], out[5], out[7], out[1]. */
	  L04 += L05;
	  L05 += L06;
	  L06 += L07;

	  /* Halved here; the odd outputs below shift by one bit less in return. */
	  L04 /= 2;
	  L05 /= 2;
	  L06 /= 2;
	  L07 /= 2;

	  L05 = FIX_MULT_DCT(AAN_C4_FIX, L05);

	  /* L25 is the shared rotation term subtracted from both L04 and L06. */
	  L25 = L06 - L04;
	  L25 = FIX_MULT_DCT(AAN_C6_FIX, L25);

	  L04 = FIX_MULT_DCT(AAN_Q0_FIX, L04);
	  L04 -= L25;

	  L06 = FIX_MULT_DCT(AAN_Q1_FIX, L06);
	  L06 -= L25;

	  BUTTERFLY(L07, L05);

	  BUTTERFLY(L05, L04);
	  out[3] = (int16_t)SCALE(L04, DCTII_8_SHIFT_3 - 1);
	  out[5] = (int16_t)SCALE(L05, DCTII_8_SHIFT_5 - 1);

	  BUTTERFLY(L07, L06);
	  out[7] = (int16_t)SCALE(L06, DCTII_8_SHIFT_7 - 1);
	  out[1] = (int16_t)SCALE(L07, DCTII_8_SHIFT_1 - 1);
	#undef BUTTERFLY
	#undef FIX_MULT_DCT

	#ifdef DEBUG_DCT
	  {
	    /* Run the floating-point mirror for its internal assertions only; the
	     * values it returns are not compared against out[]. */
	    float float_out[8];
	    float_dct2_8(float_out, in);
	  }
	#endif
	}

	/**@}*/