jfdctflt.asm - third_party/libjpeg-turbo - Git at Google

 ;
 ; jfdctflt.asm - floating-point FDCT (non-SIMD)
 ;
 ; x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
 ;
 ; This file should be assembled with NASM (Netwide Assembler),
 ; can *not* be assembled with Microsoft's MASM or any compatible
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; This file contains a floating-point implementation of the forward DCT
 ; (Discrete Cosine Transform). The following code is based directly on
 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
 ;
 ; Last Modified : October 17, 2004
 ;
 ; [TAB8]

 %include "jsimdext.inc"
 %include "jdct.inc"

 %ifdef DCT_FLOAT_SUPPORTED

 ; This module is specialized to the case DCTSIZE = 8.
 ;
 %if DCTSIZE != 8
 %error "Sorry, this code only copes with 8x8 DCTs."
 %endif

 ; --------------------------------------------------------------------------
 	SECTION	SEG_CONST

 %define ROTATOR_TYPE	FP32	; float

 	alignz	16
 	global	EXTN(jconst_fdct_float)

 EXTN(jconst_fdct_float):

 F_0_382	dd	0.382683432365089771728460	; cos(PI*3/8)
 F_0_707	dd	0.707106781186547524400844	; cos(PI*1/4)
 F_0_541	dd	0.541196100146196984399723	; cos(PI*1/8)-cos(PI*3/8)
 F_1_306	dd	1.306562964876376527856643	; cos(PI*1/8)+cos(PI*3/8)

 	alignz	16

 ; --------------------------------------------------------------------------
 	SECTION	SEG_TEXT
 	BITS	32
 ;
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
 ; jpeg_fdct_float (FAST_FLOAT * data)
 ;

 %define data(b)	(b)+8		; FAST_FLOAT * data

 	align	16
 	global	EXTN(jpeg_fdct_float)

 EXTN(jpeg_fdct_float):
 	push	ebp
 	mov	ebp,esp
 	pushpic	ebx
 ;	push	ecx		; need not be preserved
 ;	push	edx		; need not be preserved
 ;	push	esi		; unused
 ;	push	edi		; unused

 	get_GOT	ebx		; get GOT address

 	; ---- Pass 1: process rows.

 	mov	edx, POINTER [data(ebp)]	; (FAST_FLOAT *)
 	mov	ecx, DCTSIZE
 	alignx	16,7
 .rowloop:
 	fld	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
 	fadd	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
 	fadd	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
 	fadd	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
 	fadd	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]

 	; -- Even part

 	fld	st2	; st2 = st2 + st1, st1 = st2 - st1
 	fsub	st0,st2
 	fxch	st0,st2
 	faddp	st3,st0
 	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
 	fsub	st0,st1
 	fxch	st0,st1
 	faddp	st4,st0

 	fadd	st0,st1
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]

 	fld	st2	; st3 = st2 + st3, st2 = st2 - st3
 	fsub	st0,st4
 	fxch	st0,st3
 	faddp	st4,st0
 	fld	st1	; st0 = st1 + st0, st1 = st1 - st0
 	fsub	st0,st1
 	fxch	st0,st2
 	faddp	st1,st0

 	fld	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
 	fsub	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
 	fxch	st0,st4
 	fld	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
 	fsub	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
 	fxch	st0,st4
 	fld	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
 	fsub	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
 	fxch	st0,st4
 	fld	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
 	fsub	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
 	fxch	st0,st4

 	fstp	FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]

 	; -- Odd part

 	fadd	st2,st0
 	fadd	st0,st1
 	fxch	st0,st3
 	fadd	st1,st0
 	fxch	st0,st3

 	fld	st2
 	fxch	st0,st1
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
 	fxch	st0,st1
 	fsub	st0,st2
 	fxch	st0,st3
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]
 	fxch	st0,st3
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]
 	fxch	st0,st2
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]
 	fxch	st0,st2
 	fadd	st3,st0
 	faddp	st2,st0

 	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
 	fsub	st0,st1
 	fxch	st0,st1
 	faddp	st4,st0

 	fld	st2	; st0 = st0 + st2, st2 = st0 - st2
 	fsubr	st0,st1
 	fxch	st0,st3
 	faddp	st1,st0
 	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
 	fsubr	st0,st4
 	fxch	st0,st2
 	faddp	st4,st0

 	fstp	FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]

 	add	edx, byte DCTSIZE*SIZEOF_FAST_FLOAT
 	dec	ecx				; advance pointer to next row
 	jnz	near .rowloop

 	; ---- Pass 2: process columns.

 	mov	edx, POINTER [data(ebp)]	; (FAST_FLOAT *)
 	mov	ecx, DCTSIZE
 	alignx	16,7
 .columnloop:
 	fld	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
 	fadd	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
 	fadd	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
 	fadd	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
 	fld	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
 	fadd	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]

 	; -- Even part

 	fld	st2	; st2 = st2 + st1, st1 = st2 - st1
 	fsub	st0,st2
 	fxch	st0,st2
 	faddp	st3,st0
 	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
 	fsub	st0,st1
 	fxch	st0,st1
 	faddp	st4,st0

 	fadd	st0,st1
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]

 	fld	st2	; st3 = st2 + st3, st2 = st2 - st3
 	fsub	st0,st4
 	fxch	st0,st3
 	faddp	st4,st0
 	fld	st1	; st0 = st1 + st0, st1 = st1 - st0
 	fsub	st0,st1
 	fxch	st0,st2
 	faddp	st1,st0

 	fld	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
 	fsub	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
 	fxch	st0,st4
 	fld	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
 	fsub	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
 	fxch	st0,st4
 	fld	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
 	fsub	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
 	fxch	st0,st4
 	fld	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
 	fsub	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
 	fxch	st0,st4

 	fstp	FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]

 	; -- Odd part

 	fadd	st2,st0
 	fadd	st0,st1
 	fxch	st0,st3
 	fadd	st1,st0
 	fxch	st0,st3

 	fld	st2
 	fxch	st0,st1
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
 	fxch	st0,st1
 	fsub	st0,st2
 	fxch	st0,st3
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]
 	fxch	st0,st3
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]
 	fxch	st0,st2
 	fmul	ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]
 	fxch	st0,st2
 	fadd	st3,st0
 	faddp	st2,st0

 	fld	st3	; st3 = st3 + st0, st0 = st3 - st0
 	fsub	st0,st1
 	fxch	st0,st1
 	faddp	st4,st0

 	fld	st2	; st0 = st0 + st2, st2 = st0 - st2
 	fsubr	st0,st1
 	fxch	st0,st3
 	faddp	st1,st0
 	fld	st1	; st3 = st3 + st1, st1 = st3 - st1
 	fsubr	st0,st4
 	fxch	st0,st2
 	faddp	st4,st0

 	fstp	FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
 	fstp	FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]

 	add	edx, byte SIZEOF_FAST_FLOAT ; advance pointer to next column
 	dec	ecx
 	jnz	near .columnloop

 ;	pop	edi		; unused
 ;	pop	esi		; unused
 ;	pop	edx		; need not be preserved
 ;	pop	ecx		; need not be preserved
 	poppic	ebx
 	pop	ebp
 	ret

 %endif ; DCT_FLOAT_SUPPORTED
	;
	; jfdctflt.asm - floating-point FDCT (non-SIMD)
	;
	; x86 SIMD extension for IJG JPEG library
	; Copyright (C) 1999-2006, MIYASAKA Masaru.
	; For conditions of distribution and use, see copyright notice in jsimdext.inc
	;
	; This file should be assembled with NASM (Netwide Assembler),
	; can not be assembled with Microsoft's MASM or any compatible
	; assembler (including Borland's Turbo Assembler).
	; NASM is available from http://nasm.sourceforge.net/ or
	; http://sourceforge.net/project/showfiles.php?group_id=6208
	;
	; This file contains a floating-point implementation of the forward DCT
	; (Discrete Cosine Transform). The following code is based directly on
	; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
	;
	; Last Modified : October 17, 2004
	;
	; [TAB8]

	%include "jsimdext.inc"
	%include "jdct.inc"

	%ifdef DCT_FLOAT_SUPPORTED

	; This module is specialized to the case DCTSIZE = 8.
	;
	%if DCTSIZE != 8
	%error "Sorry, this code only copes with 8x8 DCTs."
	%endif

	; --------------------------------------------------------------------------
	SECTION SEG_CONST

	%define ROTATOR_TYPE FP32 ; float

	alignz 16
	global EXTN(jconst_fdct_float)

	EXTN(jconst_fdct_float):

	F_0_382 dd 0.382683432365089771728460 ; cos(PI*3/8)
	F_0_707 dd 0.707106781186547524400844 ; cos(PI*1/4)
	F_0_541 dd 0.541196100146196984399723 ; cos(PI1/8)-cos(PI3/8)
	F_1_306 dd 1.306562964876376527856643 ; cos(PI1/8)+cos(PI3/8)

	alignz 16

	; --------------------------------------------------------------------------
	SECTION SEG_TEXT
	BITS 32
	;
	; Perform the forward DCT on one block of samples.
	;
	; GLOBAL(void)
	; jpeg_fdct_float (FAST_FLOAT * data)
	;

	%define data(b) (b)+8 ; FAST_FLOAT * data

	align 16
	global EXTN(jpeg_fdct_float)

	EXTN(jpeg_fdct_float):
	push ebp
	mov ebp,esp
	pushpic ebx
	; push ecx ; need not be preserved
	; push edx ; need not be preserved
	; push esi ; unused
	; push edi ; unused

	get_GOT ebx ; get GOT address

	; ---- Pass 1: process rows.

	mov edx, POINTER [data(ebp)] ; (FAST_FLOAT *)
	mov ecx, DCTSIZE
	alignx 16,7
	.rowloop:
	fld FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
	fadd FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
	fadd FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
	fadd FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
	fadd FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]

	; -- Even part

	fld st2 ; st2 = st2 + st1, st1 = st2 - st1
	fsub st0,st2
	fxch st0,st2
	faddp st3,st0
	fld st3 ; st3 = st3 + st0, st0 = st3 - st0
	fsub st0,st1
	fxch st0,st1
	faddp st4,st0

	fadd st0,st1
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]

	fld st2 ; st3 = st2 + st3, st2 = st2 - st3
	fsub st0,st4
	fxch st0,st3
	faddp st4,st0
	fld st1 ; st0 = st1 + st0, st1 = st1 - st0
	fsub st0,st1
	fxch st0,st2
	faddp st1,st0

	fld FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
	fsub FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
	fxch st0,st4
	fld FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
	fsub FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
	fxch st0,st4
	fld FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
	fsub FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
	fxch st0,st4
	fld FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
	fsub FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
	fxch st0,st4

	fstp FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]

	; -- Odd part

	fadd st2,st0
	fadd st0,st1
	fxch st0,st3
	fadd st1,st0
	fxch st0,st3

	fld st2
	fxch st0,st1
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
	fxch st0,st1
	fsub st0,st2
	fxch st0,st3
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]
	fxch st0,st3
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]
	fxch st0,st2
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]
	fxch st0,st2
	fadd st3,st0
	faddp st2,st0

	fld st3 ; st3 = st3 + st0, st0 = st3 - st0
	fsub st0,st1
	fxch st0,st1
	faddp st4,st0

	fld st2 ; st0 = st0 + st2, st2 = st0 - st2
	fsubr st0,st1
	fxch st0,st3
	faddp st1,st0
	fld st1 ; st3 = st3 + st1, st1 = st3 - st1
	fsubr st0,st4
	fxch st0,st2
	faddp st4,st0

	fstp FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]

	add edx, byte DCTSIZE*SIZEOF_FAST_FLOAT
	dec ecx ; advance pointer to next row
	jnz near .rowloop

	; ---- Pass 2: process columns.

	mov edx, POINTER [data(ebp)] ; (FAST_FLOAT *)
	mov ecx, DCTSIZE
	alignx 16,7
	.columnloop:
	fld FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
	fadd FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
	fadd FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
	fadd FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
	fld FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
	fadd FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]

	; -- Even part

	fld st2 ; st2 = st2 + st1, st1 = st2 - st1
	fsub st0,st2
	fxch st0,st2
	faddp st3,st0
	fld st3 ; st3 = st3 + st0, st0 = st3 - st0
	fsub st0,st1
	fxch st0,st1
	faddp st4,st0

	fadd st0,st1
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]

	fld st2 ; st3 = st2 + st3, st2 = st2 - st3
	fsub st0,st4
	fxch st0,st3
	faddp st4,st0
	fld st1 ; st0 = st1 + st0, st1 = st1 - st0
	fsub st0,st1
	fxch st0,st2
	faddp st1,st0

	fld FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
	fsub FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
	fxch st0,st4
	fld FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
	fsub FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
	fxch st0,st4
	fld FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
	fsub FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
	fxch st0,st4
	fld FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
	fsub FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
	fxch st0,st4

	fstp FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]

	; -- Odd part

	fadd st2,st0
	fadd st0,st1
	fxch st0,st3
	fadd st1,st0
	fxch st0,st3

	fld st2
	fxch st0,st1
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)]
	fxch st0,st1
	fsub st0,st2
	fxch st0,st3
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_541)]
	fxch st0,st3
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_382)]
	fxch st0,st2
	fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_306)]
	fxch st0,st2
	fadd st3,st0
	faddp st2,st0

	fld st3 ; st3 = st3 + st0, st0 = st3 - st0
	fsub st0,st1
	fxch st0,st1
	faddp st4,st0

	fld st2 ; st0 = st0 + st2, st2 = st0 - st2
	fsubr st0,st1
	fxch st0,st3
	faddp st1,st0
	fld st1 ; st3 = st3 + st1, st1 = st3 - st1
	fsubr st0,st4
	fxch st0,st2
	faddp st4,st0

	fstp FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
	fstp FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]

	add edx, byte SIZEOF_FAST_FLOAT ; advance pointer to next column
	dec ecx
	jnz near .columnloop

	; pop edi ; unused
	; pop esi ; unused
	; pop edx ; need not be preserved
	; pop ecx ; need not be preserved
	poppic ebx
	pop ebp
	ret

	%endif ; DCT_FLOAT_SUPPORTED