; jfdctint.asm - third_party/libjpeg-turbo - Git at Google

 ;
 ; jfdctint.asm - accurate integer FDCT (non-SIMD)
 ;
 ; x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
 ;
 ; This file should be assembled with NASM (Netwide Assembler); it
 ; can *not* be assembled with Microsoft's MASM or any compatible
 ; assembler (including Borland's Turbo Assembler).
 ; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; This file contains a slow-but-accurate integer implementation of the
 ; forward DCT (Discrete Cosine Transform). The following code is based
 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
 ; more details.
 ;
 ; Last Modified : October 17, 2004
 ;
 ; [TAB8]

 %include "jsimdext.inc"
 %include "jdct.inc"

 %ifdef DCT_ISLOW_SUPPORTED

 ; This module is specialized to the case DCTSIZE = 8.
 ;
 %if DCTSIZE != 8
 %error "Sorry, this code only copes with 8x8 DCTs."
 %endif

 ; --------------------------------------------------------------------------

 ; descale REG, N
 ;
 ; Divide the signed DWORD held in REG by 2^N, rounding to nearest:
 ; half of the divisor, 1 << (N-1), is added first and the sum is then
 ; shifted right arithmetically by N bits.
 ;   %1 = 32-bit register operand; it is overwritten with the result
 ;   %2 = assemble-time constant bit count (the bias expression
 ;        1 << (N-1) is only meaningful for N >= 1)
 ; The add/sar pair overwrites the flags.
 ;
 %macro	descale 2
 %if (%2)>7
 	add	%1, (1<<((%2)-1))	; bias is 128 or more: use add reg32,imm32
 %else
 	add	%1, byte (1<<((%2)-1))	; bias is at most 64: add reg32,imm8
 %endif
 	sar	%1, %2
 %endmacro

 ; --------------------------------------------------------------------------

 ; Fixed-point parameters used by jpeg_fdct_islow.
 ;
 ; CONST_BITS is the number of fraction bits carried by the F_* multiplier
 ; constants below, i.e. FIX(x) = round(x * 2^CONST_BITS).
 ; PASS1_BITS is the number of extra fraction bits the row pass leaves in
 ; its results (it shifts/descales by PASS1_BITS less than CONST_BITS); the
 ; column pass descales by PASS1_BITS more to remove them again.
 %define CONST_BITS	13
 %define PASS1_BITS	2

 %if CONST_BITS == 13
 ; Pre-computed table for the default CONST_BITS: each value is the
 ; constant named in its comment multiplied by 8192 (2^13) and rounded.
 F_0_298	equ	 2446		; FIX(0.298631336)
 F_0_390	equ	 3196		; FIX(0.390180644)
 F_0_541	equ	 4433		; FIX(0.541196100)
 F_0_765	equ	 6270		; FIX(0.765366865)
 F_0_899	equ	 7373		; FIX(0.899976223)
 F_1_175	equ	 9633		; FIX(1.175875602)
 F_1_501	equ	12299		; FIX(1.501321110)
 F_1_847	equ	15137		; FIX(1.847759065)
 F_1_961	equ	16069		; FIX(1.961570560)
 F_2_053	equ	16819		; FIX(2.053119869)
 F_2_562	equ	20995		; FIX(2.562915447)
 F_3_072	equ	25172		; FIX(3.072711026)
 %else
 ; NASM cannot do compile-time arithmetic on floating-point constants.
 ; Instead, each literal below is the constant already scaled by 2^30;
 ; DESCALE(x,n) rounds it down to CONST_BITS fraction bits by adding
 ; half of 2^n and shifting right by n = 30-CONST_BITS.
 %define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
 F_0_298	equ	DESCALE( 320652955,30-CONST_BITS)	; FIX(0.298631336)
 F_0_390	equ	DESCALE( 418953276,30-CONST_BITS)	; FIX(0.390180644)
 F_0_541	equ	DESCALE( 581104887,30-CONST_BITS)	; FIX(0.541196100)
 F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
 F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
 F_1_175	equ	DESCALE(1262586813,30-CONST_BITS)	; FIX(1.175875602)
 F_1_501	equ	DESCALE(1612031267,30-CONST_BITS)	; FIX(1.501321110)
 F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
 F_1_961	equ	DESCALE(2106220350,30-CONST_BITS)	; FIX(1.961570560)
 F_2_053	equ	DESCALE(2204520673,30-CONST_BITS)	; FIX(2.053119869)
 F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
 F_3_072	equ	DESCALE(3299298341,30-CONST_BITS)	; FIX(3.072711026)
 %endif

 ; --------------------------------------------------------------------------
 	SECTION	SEG_TEXT
 	BITS	32
 ;
 ; Perform the forward DCT on one block of samples.
 ;
 ; GLOBAL(void)
 ; jpeg_fdct_islow (DCTELEM * data)
 ;
 ; In:    data  = pointer to the coefficient block; it is the first stack
 ;                argument, read from [ebp+8] once the frame is set up.
 ; Out:   nothing in registers; the block is overwritten in place.
 ; Saved: ebp, ebx, esi, edi (pushed in the prologue, popped before ret).
 ; Used:  eax, ecx, edx and the flags are overwritten and not restored.
 ;
 ; The work is done by two loops of DCTSIZE iterations each.  Pass 1
 ; steps the data pointer by one row (DCTSIZE*SIZEOF_DCTELEM bytes) and
 ; addresses elements through ROW(); pass 2 steps by one element
 ; (SIZEOF_DCTELEM bytes) and addresses elements through COL().
 ; ROW()/COL(), DCTELEM, POINTER, INT32, EXTN and alignx come from the
 ; included headers and are not visible in this file.
 ;
 ; Inside both loop bodies six DWORDs are pushed, giving this layout
 ; until the pops in the odd part unwind it:
 ;	[esp+ 0] tmp4	[esp+ 4] tmp5	[esp+ 8] dataptr
 ;	[esp+12] tmp6	[esp+16] tmp7	[esp+20] ctr
 ;

 %define data(b)	(b)+8		; DCTELEM * data

 	align	16
 	global	EXTN(jpeg_fdct_islow)

 EXTN(jpeg_fdct_islow):
 	push	ebp
 	mov	ebp,esp
 	push	ebx
 ;	push	ecx		; need not be preserved
 ;	push	edx		; need not be preserved
 	push	esi
 	push	edi

 	; ---- Pass 1: process rows.
 	; Results of this pass keep PASS1_BITS extra fraction bits:
 	; data0/data4 are shifted left by PASS1_BITS and the products
 	; (scaled by 2^CONST_BITS) are descaled by CONST_BITS-PASS1_BITS.

 	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
 	mov	ecx, DCTSIZE
 	alignx	16,7
 .rowloop:
 	; Butterflies: tmpK = in[K]+in[7-K] stays in a register (K=0..3),
 	; tmp(7-K) = in[K]-in[7-K] is pushed for the odd part.
 	movsx	eax, DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)]
 	movsx	edi, DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)]
 	lea	esi,[eax+edi]	; esi=tmp0
 	sub	eax,edi		; eax=tmp7
 	push	ecx		; ctr
 	push	eax

 	movsx	ebx, DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)]
 	movsx	ecx, DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)]
 	lea	edi,[ebx+ecx]	; edi=tmp1
 	sub	ebx,ecx		; ebx=tmp6
 	push	ebx

 	movsx	eax, DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)]
 	movsx	ecx, DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)]
 	lea	ebx,[eax+ecx]	; ebx=tmp2
 	sub	eax,ecx		; eax=tmp5
 	push	edx		; dataptr
 	push	eax

 	; edx is needed as a scratch register from here on; the data
 	; pointer is reloaded from its stack slot when stores begin.
 	movsx	ecx, DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)]
 	movsx	eax, DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)]
 	lea	edx,[ecx+eax]	; edx=tmp3
 	sub	ecx,eax		; ecx=tmp4
 	push	ecx

 	; -- Even part

 	lea	eax,[esi+edx]	; eax=tmp10
 	lea	ecx,[edi+ebx]	; ecx=tmp11
 	sub	esi,edx		; esi=tmp13
 	sub	edi,ebx		; edi=tmp12

 	lea	ebx,[eax+ecx]	; ebx=data0
 	sub	eax,ecx		; eax=data4
 	mov	edx, POINTER [esp+8]	; dataptr
 	sal	ebx, PASS1_BITS
 	sal	eax, PASS1_BITS
 	; Only the low 16 bits of each 32-bit result are stored.
 	mov	DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)], bx
 	mov	DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)], ax

 	lea	ecx,[edi+esi]
 	imul	ecx,(F_0_541)	; ecx=z1
 	imul	esi,(F_0_765)	; esi=MULTIPLY(tmp13,FIX_0_765366865)
 	imul	edi,(-F_1_847)	; edi=MULTIPLY(tmp12,-FIX_1_847759065)
 	add	esi,ecx		; esi=data2
 	add	edi,ecx		; edi=data6
 	descale	esi,(CONST_BITS-PASS1_BITS)
 	descale	edi,(CONST_BITS-PASS1_BITS)
 	mov	DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)], si
 	mov	DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)], di

 	; -- Odd part
 	; The spilled differences are read without popping; the same
 	; slots are popped further down when the raw tmp4..tmp7 values
 	; are needed a second time.

 	mov	eax, INT32 [esp]	; eax=tmp4
 	mov	ebx, INT32 [esp+4]	; ebx=tmp5
 	mov	ecx, INT32 [esp+12]	; ecx=tmp6
 	mov	esi, INT32 [esp+16]	; esi=tmp7

 	lea	edx,[eax+ecx]	; edx=z3
 	lea	edi,[ebx+esi]	; edi=z4
 	add	eax,esi		; eax=z1
 	add	ebx,ecx		; ebx=z2

 	lea	esi,[edx+edi]
 	imul	esi,(F_1_175)	; esi=z5

 	imul	edx,(-F_1_961)	; edx=z3(=MULTIPLY(z3,-FIX_1_961570560))
 	imul	edi,(-F_0_390)	; edi=z4(=MULTIPLY(z4,-FIX_0_390180644))
 	imul	eax,(-F_0_899)	; eax=z1(=MULTIPLY(z1,-FIX_0_899976223))
 	imul	ebx,(-F_2_562)	; ebx=z2(=MULTIPLY(z2,-FIX_2_562915447))

 	add	edx,esi		; edx=z3(=z3+z5)
 	add	edi,esi		; edi=z4(=z4+z5)

 	lea	ecx,[eax+edx]	; ecx=z1+z3
 	lea	esi,[ebx+edi]	; esi=z2+z4
 	add	eax,edi		; eax=z1+z4
 	add	ebx,edx		; ebx=z2+z3

 	pop	edx		; edx=tmp4
 	pop	edi		; edi=tmp5
 	imul	edx,(F_0_298)	; edx=tmp4(=MULTIPLY(tmp4,FIX_0_298631336))
 	imul	edi,(F_2_053)	; edi=tmp5(=MULTIPLY(tmp5,FIX_2_053119869))
 	add	ecx,edx		; ecx=data7(=tmp4+z1+z3)
 	add	esi,edi		; esi=data5(=tmp5+z2+z4)
 	pop	edx		; dataptr
 	descale	ecx,(CONST_BITS-PASS1_BITS)
 	descale	esi,(CONST_BITS-PASS1_BITS)
 	mov	DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)], cx
 	mov	DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)], si

 	pop	edi		; edi=tmp6
 	pop	ecx		; ecx=tmp7
 	imul	edi,(F_3_072)	; edi=tmp6(=MULTIPLY(tmp6,FIX_3_072711026))
 	imul	ecx,(F_1_501)	; ecx=tmp7(=MULTIPLY(tmp7,FIX_1_501321110))
 	add	ebx,edi		; ebx=data3(=tmp6+z2+z3)
 	add	eax,ecx		; eax=data1(=tmp7+z1+z4)
 	pop	ecx		; ctr
 	descale	ebx,(CONST_BITS-PASS1_BITS)
 	descale	eax,(CONST_BITS-PASS1_BITS)
 	mov	DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)], bx
 	mov	DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)], ax

 	; advance pointer to next row
 	add	edx, byte DCTSIZE*SIZEOF_DCTELEM
 	dec	ecx			; one row done; repeat while ctr != 0
 	jnz	near .rowloop

 	; ---- Pass 2: process columns.
 	; Same arithmetic as pass 1, but every result is descaled by
 	; PASS1_BITS more, which removes the extra fraction bits that
 	; pass 1 left in the data.

 	mov	edx, POINTER [data(ebp)]	; (DCTELEM *)
 	mov	ecx, DCTSIZE
 	alignx	16,7
 .columnloop:
 	; Butterflies: tmpK = in[K]+in[7-K] stays in a register (K=0..3),
 	; tmp(7-K) = in[K]-in[7-K] is pushed for the odd part.
 	movsx	eax, DCTELEM [COL(0,edx,SIZEOF_DCTELEM)]
 	movsx	edi, DCTELEM [COL(7,edx,SIZEOF_DCTELEM)]
 	lea	esi,[eax+edi]	; esi=tmp0
 	sub	eax,edi		; eax=tmp7
 	push	ecx		; ctr
 	push	eax

 	movsx	ebx, DCTELEM [COL(1,edx,SIZEOF_DCTELEM)]
 	movsx	ecx, DCTELEM [COL(6,edx,SIZEOF_DCTELEM)]
 	lea	edi,[ebx+ecx]	; edi=tmp1
 	sub	ebx,ecx		; ebx=tmp6
 	push	ebx

 	movsx	eax, DCTELEM [COL(2,edx,SIZEOF_DCTELEM)]
 	movsx	ecx, DCTELEM [COL(5,edx,SIZEOF_DCTELEM)]
 	lea	ebx,[eax+ecx]	; ebx=tmp2
 	sub	eax,ecx		; eax=tmp5
 	push	edx		; dataptr
 	push	eax

 	; edx is needed as a scratch register from here on; the data
 	; pointer is reloaded from its stack slot when stores begin.
 	movsx	ecx, DCTELEM [COL(3,edx,SIZEOF_DCTELEM)]
 	movsx	eax, DCTELEM [COL(4,edx,SIZEOF_DCTELEM)]
 	lea	edx,[ecx+eax]	; edx=tmp3
 	sub	ecx,eax		; ecx=tmp4
 	push	ecx

 	; -- Even part

 	lea	eax,[esi+edx]	; eax=tmp10
 	lea	ecx,[edi+ebx]	; ecx=tmp11
 	sub	esi,edx		; esi=tmp13
 	sub	edi,ebx		; edi=tmp12

 	lea	ebx,[eax+ecx]	; ebx=data0
 	sub	eax,ecx		; eax=data4
 	mov	edx, POINTER [esp+8]	; dataptr
 	descale	ebx, PASS1_BITS
 	descale	eax, PASS1_BITS
 	; Only the low 16 bits of each 32-bit result are stored.
 	mov	DCTELEM [COL(0,edx,SIZEOF_DCTELEM)], bx
 	mov	DCTELEM [COL(4,edx,SIZEOF_DCTELEM)], ax

 	lea	ecx,[edi+esi]
 	imul	ecx,(F_0_541)	; ecx=z1
 	imul	esi,(F_0_765)	; esi=MULTIPLY(tmp13,FIX_0_765366865)
 	imul	edi,(-F_1_847)	; edi=MULTIPLY(tmp12,-FIX_1_847759065)
 	add	esi,ecx		; esi=data2
 	add	edi,ecx		; edi=data6
 	descale	esi,(CONST_BITS+PASS1_BITS)
 	descale	edi,(CONST_BITS+PASS1_BITS)
 	mov	DCTELEM [COL(2,edx,SIZEOF_DCTELEM)], si
 	mov	DCTELEM [COL(6,edx,SIZEOF_DCTELEM)], di

 	; -- Odd part
 	; The spilled differences are read without popping; the same
 	; slots are popped further down when the raw tmp4..tmp7 values
 	; are needed a second time.

 	mov	eax, INT32 [esp]	; eax=tmp4
 	mov	ebx, INT32 [esp+4]	; ebx=tmp5
 	mov	ecx, INT32 [esp+12]	; ecx=tmp6
 	mov	esi, INT32 [esp+16]	; esi=tmp7

 	lea	edx,[eax+ecx]	; edx=z3
 	lea	edi,[ebx+esi]	; edi=z4
 	add	eax,esi		; eax=z1
 	add	ebx,ecx		; ebx=z2

 	lea	esi,[edx+edi]
 	imul	esi,(F_1_175)	; esi=z5

 	imul	edx,(-F_1_961)	; edx=z3(=MULTIPLY(z3,-FIX_1_961570560))
 	imul	edi,(-F_0_390)	; edi=z4(=MULTIPLY(z4,-FIX_0_390180644))
 	imul	eax,(-F_0_899)	; eax=z1(=MULTIPLY(z1,-FIX_0_899976223))
 	imul	ebx,(-F_2_562)	; ebx=z2(=MULTIPLY(z2,-FIX_2_562915447))

 	add	edx,esi		; edx=z3(=z3+z5)
 	add	edi,esi		; edi=z4(=z4+z5)

 	lea	ecx,[eax+edx]	; ecx=z1+z3
 	lea	esi,[ebx+edi]	; esi=z2+z4
 	add	eax,edi		; eax=z1+z4
 	add	ebx,edx		; ebx=z2+z3

 	pop	edx		; edx=tmp4
 	pop	edi		; edi=tmp5
 	imul	edx,(F_0_298)	; edx=tmp4(=MULTIPLY(tmp4,FIX_0_298631336))
 	imul	edi,(F_2_053)	; edi=tmp5(=MULTIPLY(tmp5,FIX_2_053119869))
 	add	ecx,edx		; ecx=data7(=tmp4+z1+z3)
 	add	esi,edi		; esi=data5(=tmp5+z2+z4)
 	pop	edx		; dataptr
 	descale	ecx,(CONST_BITS+PASS1_BITS)
 	descale	esi,(CONST_BITS+PASS1_BITS)
 	mov	DCTELEM [COL(7,edx,SIZEOF_DCTELEM)], cx
 	mov	DCTELEM [COL(5,edx,SIZEOF_DCTELEM)], si

 	pop	edi		; edi=tmp6
 	pop	ecx		; ecx=tmp7
 	imul	edi,(F_3_072)	; edi=tmp6(=MULTIPLY(tmp6,FIX_3_072711026))
 	imul	ecx,(F_1_501)	; ecx=tmp7(=MULTIPLY(tmp7,FIX_1_501321110))
 	add	ebx,edi		; ebx=data3(=tmp6+z2+z3)
 	add	eax,ecx		; eax=data1(=tmp7+z1+z4)
 	pop	ecx		; ctr
 	descale	ebx,(CONST_BITS+PASS1_BITS)
 	descale	eax,(CONST_BITS+PASS1_BITS)
 	mov	DCTELEM [COL(3,edx,SIZEOF_DCTELEM)], bx
 	mov	DCTELEM [COL(1,edx,SIZEOF_DCTELEM)], ax

 	add	edx, byte SIZEOF_DCTELEM    ; advance pointer to next column
 	dec	ecx			; one column done; repeat while ctr != 0
 	jnz	near .columnloop

 	; Restore the callee-saved registers in reverse push order.
 	pop	edi
 	pop	esi
 ;	pop	edx		; need not be preserved
 ;	pop	ecx		; need not be preserved
 	pop	ebx
 	pop	ebp
 	ret

 %endif ; DCT_ISLOW_SUPPORTED
	;
	; jfdctint.asm - accurate integer FDCT (non-SIMD)
	;
	; x86 SIMD extension for IJG JPEG library
	; Copyright (C) 1999-2006, MIYASAKA Masaru.
	; For conditions of distribution and use, see copyright notice in jsimdext.inc
	;
	; This file should be assembled with NASM (Netwide Assembler); it
	; can not be assembled with Microsoft's MASM or any compatible
	; assembler (including Borland's Turbo Assembler).
	; NASM is available from http://nasm.sourceforge.net/ or
	; http://sourceforge.net/project/showfiles.php?group_id=6208
	;
	; This file contains a slow-but-accurate integer implementation of the
	; forward DCT (Discrete Cosine Transform). The following code is based
	; directly on the IJG's original jfdctint.c; see the jfdctint.c for
	; more details.
	;
	; Last Modified : October 17, 2004
	;
	; [TAB8]

	%include "jsimdext.inc"
	%include "jdct.inc"

	%ifdef DCT_ISLOW_SUPPORTED

	; This module is specialized to the case DCTSIZE = 8.
	;
	%if DCTSIZE != 8
	%error "Sorry, this code only copes with 8x8 DCTs."
	%endif

	; --------------------------------------------------------------------------

	; descale REG, N
	;
	; Divide the signed DWORD held in REG by 2^N, rounding to nearest:
	; half of the divisor, 1 << (N-1), is added first and the sum is then
	; shifted right arithmetically by N bits.
	;   %1 = 32-bit register operand; it is overwritten with the result
	;   %2 = assemble-time constant bit count (the bias expression
	;        1 << (N-1) is only meaningful for N >= 1)
	; The add/sar pair overwrites the flags.
	;
	%macro descale 2
	%if (%2)>7
	add %1, (1<<((%2)-1)) ; bias is 128 or more: use add reg32,imm32
	%else
	add %1, byte (1<<((%2)-1)) ; bias is at most 64: add reg32,imm8
	%endif
	sar %1, %2
	%endmacro

	; --------------------------------------------------------------------------

	; Fixed-point parameters used by jpeg_fdct_islow.
	;
	; CONST_BITS is the number of fraction bits carried by the F_* multiplier
	; constants below, i.e. FIX(x) = round(x * 2^CONST_BITS).
	; PASS1_BITS is the number of extra fraction bits the row pass leaves in
	; its results (it shifts/descales by PASS1_BITS less than CONST_BITS); the
	; column pass descales by PASS1_BITS more to remove them again.
	%define CONST_BITS 13
	%define PASS1_BITS 2

	%if CONST_BITS == 13
	; Pre-computed table for the default CONST_BITS: each value is the
	; constant named in its comment multiplied by 8192 (2^13) and rounded.
	F_0_298 equ 2446 ; FIX(0.298631336)
	F_0_390 equ 3196 ; FIX(0.390180644)
	F_0_541 equ 4433 ; FIX(0.541196100)
	F_0_765 equ 6270 ; FIX(0.765366865)
	F_0_899 equ 7373 ; FIX(0.899976223)
	F_1_175 equ 9633 ; FIX(1.175875602)
	F_1_501 equ 12299 ; FIX(1.501321110)
	F_1_847 equ 15137 ; FIX(1.847759065)
	F_1_961 equ 16069 ; FIX(1.961570560)
	F_2_053 equ 16819 ; FIX(2.053119869)
	F_2_562 equ 20995 ; FIX(2.562915447)
	F_3_072 equ 25172 ; FIX(3.072711026)
	%else
	; NASM cannot do compile-time arithmetic on floating-point constants.
	; Instead, each literal below is the constant already scaled by 2^30;
	; DESCALE(x,n) rounds it down to CONST_BITS fraction bits by adding
	; half of 2^n and shifting right by n = 30-CONST_BITS.
	%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
	F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
	F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
	F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
	F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
	F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
	F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
	F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
	F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
	F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
	F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
	F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
	F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
	%endif

	; --------------------------------------------------------------------------
	SECTION SEG_TEXT
	BITS 32
	;
	; Perform the forward DCT on one block of samples.
	;
	; GLOBAL(void)
	; jpeg_fdct_islow (DCTELEM * data)
	;
	; In:    data  = pointer to the coefficient block; it is the first stack
	;                argument, read from [ebp+8] once the frame is set up.
	; Out:   nothing in registers; the block is overwritten in place.
	; Saved: ebp, ebx, esi, edi (pushed in the prologue, popped before ret).
	; Used:  eax, ecx, edx and the flags are overwritten and not restored.
	;
	; The work is done by two loops of DCTSIZE iterations each.  Pass 1
	; steps the data pointer by one row (DCTSIZE*SIZEOF_DCTELEM bytes) and
	; addresses elements through ROW(); pass 2 steps by one element
	; (SIZEOF_DCTELEM bytes) and addresses elements through COL().
	; ROW()/COL(), DCTELEM, POINTER, INT32, EXTN and alignx come from the
	; included headers and are not visible in this file.
	;
	; Inside both loop bodies six DWORDs are pushed, giving this layout
	; until the pops in the odd part unwind it:
	;   [esp+ 0] tmp4   [esp+ 4] tmp5   [esp+ 8] dataptr
	;   [esp+12] tmp6   [esp+16] tmp7   [esp+20] ctr
	;

	%define data(b) (b)+8 ; DCTELEM * data

	align 16
	global EXTN(jpeg_fdct_islow)

	EXTN(jpeg_fdct_islow):
	push ebp
	mov ebp,esp
	push ebx
	; push ecx ; need not be preserved
	; push edx ; need not be preserved
	push esi
	push edi

	; ---- Pass 1: process rows.
	; Results of this pass keep PASS1_BITS extra fraction bits:
	; data0/data4 are shifted left by PASS1_BITS and the products
	; (scaled by 2^CONST_BITS) are descaled by CONST_BITS-PASS1_BITS.

	mov edx, POINTER [data(ebp)] ; (DCTELEM *)
	mov ecx, DCTSIZE
	alignx 16,7
	.rowloop:
	; Butterflies: tmpK = in[K]+in[7-K] stays in a register (K=0..3),
	; tmp(7-K) = in[K]-in[7-K] is pushed for the odd part.
	movsx eax, DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)]
	movsx edi, DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)]
	lea esi,[eax+edi] ; esi=tmp0
	sub eax,edi ; eax=tmp7
	push ecx ; ctr
	push eax

	movsx ebx, DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)]
	movsx ecx, DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)]
	lea edi,[ebx+ecx] ; edi=tmp1
	sub ebx,ecx ; ebx=tmp6
	push ebx

	movsx eax, DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)]
	movsx ecx, DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)]
	lea ebx,[eax+ecx] ; ebx=tmp2
	sub eax,ecx ; eax=tmp5
	push edx ; dataptr
	push eax

	; edx is needed as a scratch register from here on; the data
	; pointer is reloaded from its stack slot when stores begin.
	movsx ecx, DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)]
	movsx eax, DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)]
	lea edx,[ecx+eax] ; edx=tmp3
	sub ecx,eax ; ecx=tmp4
	push ecx

	; -- Even part

	lea eax,[esi+edx] ; eax=tmp10
	lea ecx,[edi+ebx] ; ecx=tmp11
	sub esi,edx ; esi=tmp13
	sub edi,ebx ; edi=tmp12

	lea ebx,[eax+ecx] ; ebx=data0
	sub eax,ecx ; eax=data4
	mov edx, POINTER [esp+8] ; dataptr
	sal ebx, PASS1_BITS
	sal eax, PASS1_BITS
	; Only the low 16 bits of each 32-bit result are stored.
	mov DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)], bx
	mov DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)], ax

	lea ecx,[edi+esi]
	imul ecx,(F_0_541) ; ecx=z1
	imul esi,(F_0_765) ; esi=MULTIPLY(tmp13,FIX_0_765366865)
	imul edi,(-F_1_847) ; edi=MULTIPLY(tmp12,-FIX_1_847759065)
	add esi,ecx ; esi=data2
	add edi,ecx ; edi=data6
	descale esi,(CONST_BITS-PASS1_BITS)
	descale edi,(CONST_BITS-PASS1_BITS)
	mov DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)], si
	mov DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)], di

	; -- Odd part
	; The spilled differences are read without popping; the same
	; slots are popped further down when the raw tmp4..tmp7 values
	; are needed a second time.

	mov eax, INT32 [esp] ; eax=tmp4
	mov ebx, INT32 [esp+4] ; ebx=tmp5
	mov ecx, INT32 [esp+12] ; ecx=tmp6
	mov esi, INT32 [esp+16] ; esi=tmp7

	lea edx,[eax+ecx] ; edx=z3
	lea edi,[ebx+esi] ; edi=z4
	add eax,esi ; eax=z1
	add ebx,ecx ; ebx=z2

	lea esi,[edx+edi]
	imul esi,(F_1_175) ; esi=z5

	imul edx,(-F_1_961) ; edx=z3(=MULTIPLY(z3,-FIX_1_961570560))
	imul edi,(-F_0_390) ; edi=z4(=MULTIPLY(z4,-FIX_0_390180644))
	imul eax,(-F_0_899) ; eax=z1(=MULTIPLY(z1,-FIX_0_899976223))
	imul ebx,(-F_2_562) ; ebx=z2(=MULTIPLY(z2,-FIX_2_562915447))

	add edx,esi ; edx=z3(=z3+z5)
	add edi,esi ; edi=z4(=z4+z5)

	lea ecx,[eax+edx] ; ecx=z1+z3
	lea esi,[ebx+edi] ; esi=z2+z4
	add eax,edi ; eax=z1+z4
	add ebx,edx ; ebx=z2+z3

	pop edx ; edx=tmp4
	pop edi ; edi=tmp5
	imul edx,(F_0_298) ; edx=tmp4(=MULTIPLY(tmp4,FIX_0_298631336))
	imul edi,(F_2_053) ; edi=tmp5(=MULTIPLY(tmp5,FIX_2_053119869))
	add ecx,edx ; ecx=data7(=tmp4+z1+z3)
	add esi,edi ; esi=data5(=tmp5+z2+z4)
	pop edx ; dataptr
	descale ecx,(CONST_BITS-PASS1_BITS)
	descale esi,(CONST_BITS-PASS1_BITS)
	mov DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)], cx
	mov DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)], si

	pop edi ; edi=tmp6
	pop ecx ; ecx=tmp7
	imul edi,(F_3_072) ; edi=tmp6(=MULTIPLY(tmp6,FIX_3_072711026))
	imul ecx,(F_1_501) ; ecx=tmp7(=MULTIPLY(tmp7,FIX_1_501321110))
	add ebx,edi ; ebx=data3(=tmp6+z2+z3)
	add eax,ecx ; eax=data1(=tmp7+z1+z4)
	pop ecx ; ctr
	descale ebx,(CONST_BITS-PASS1_BITS)
	descale eax,(CONST_BITS-PASS1_BITS)
	mov DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)], bx
	mov DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)], ax

	; advance pointer to next row
	add edx, byte DCTSIZE*SIZEOF_DCTELEM
	dec ecx ; one row done; repeat while ctr != 0
	jnz near .rowloop

	; ---- Pass 2: process columns.
	; Same arithmetic as pass 1, but every result is descaled by
	; PASS1_BITS more, which removes the extra fraction bits that
	; pass 1 left in the data.

	mov edx, POINTER [data(ebp)] ; (DCTELEM *)
	mov ecx, DCTSIZE
	alignx 16,7
	.columnloop:
	; Butterflies: tmpK = in[K]+in[7-K] stays in a register (K=0..3),
	; tmp(7-K) = in[K]-in[7-K] is pushed for the odd part.
	movsx eax, DCTELEM [COL(0,edx,SIZEOF_DCTELEM)]
	movsx edi, DCTELEM [COL(7,edx,SIZEOF_DCTELEM)]
	lea esi,[eax+edi] ; esi=tmp0
	sub eax,edi ; eax=tmp7
	push ecx ; ctr
	push eax

	movsx ebx, DCTELEM [COL(1,edx,SIZEOF_DCTELEM)]
	movsx ecx, DCTELEM [COL(6,edx,SIZEOF_DCTELEM)]
	lea edi,[ebx+ecx] ; edi=tmp1
	sub ebx,ecx ; ebx=tmp6
	push ebx

	movsx eax, DCTELEM [COL(2,edx,SIZEOF_DCTELEM)]
	movsx ecx, DCTELEM [COL(5,edx,SIZEOF_DCTELEM)]
	lea ebx,[eax+ecx] ; ebx=tmp2
	sub eax,ecx ; eax=tmp5
	push edx ; dataptr
	push eax

	; edx is needed as a scratch register from here on; the data
	; pointer is reloaded from its stack slot when stores begin.
	movsx ecx, DCTELEM [COL(3,edx,SIZEOF_DCTELEM)]
	movsx eax, DCTELEM [COL(4,edx,SIZEOF_DCTELEM)]
	lea edx,[ecx+eax] ; edx=tmp3
	sub ecx,eax ; ecx=tmp4
	push ecx

	; -- Even part

	lea eax,[esi+edx] ; eax=tmp10
	lea ecx,[edi+ebx] ; ecx=tmp11
	sub esi,edx ; esi=tmp13
	sub edi,ebx ; edi=tmp12

	lea ebx,[eax+ecx] ; ebx=data0
	sub eax,ecx ; eax=data4
	mov edx, POINTER [esp+8] ; dataptr
	descale ebx, PASS1_BITS
	descale eax, PASS1_BITS
	; Only the low 16 bits of each 32-bit result are stored.
	mov DCTELEM [COL(0,edx,SIZEOF_DCTELEM)], bx
	mov DCTELEM [COL(4,edx,SIZEOF_DCTELEM)], ax

	lea ecx,[edi+esi]
	imul ecx,(F_0_541) ; ecx=z1
	imul esi,(F_0_765) ; esi=MULTIPLY(tmp13,FIX_0_765366865)
	imul edi,(-F_1_847) ; edi=MULTIPLY(tmp12,-FIX_1_847759065)
	add esi,ecx ; esi=data2
	add edi,ecx ; edi=data6
	descale esi,(CONST_BITS+PASS1_BITS)
	descale edi,(CONST_BITS+PASS1_BITS)
	mov DCTELEM [COL(2,edx,SIZEOF_DCTELEM)], si
	mov DCTELEM [COL(6,edx,SIZEOF_DCTELEM)], di

	; -- Odd part
	; The spilled differences are read without popping; the same
	; slots are popped further down when the raw tmp4..tmp7 values
	; are needed a second time.

	mov eax, INT32 [esp] ; eax=tmp4
	mov ebx, INT32 [esp+4] ; ebx=tmp5
	mov ecx, INT32 [esp+12] ; ecx=tmp6
	mov esi, INT32 [esp+16] ; esi=tmp7

	lea edx,[eax+ecx] ; edx=z3
	lea edi,[ebx+esi] ; edi=z4
	add eax,esi ; eax=z1
	add ebx,ecx ; ebx=z2

	lea esi,[edx+edi]
	imul esi,(F_1_175) ; esi=z5

	imul edx,(-F_1_961) ; edx=z3(=MULTIPLY(z3,-FIX_1_961570560))
	imul edi,(-F_0_390) ; edi=z4(=MULTIPLY(z4,-FIX_0_390180644))
	imul eax,(-F_0_899) ; eax=z1(=MULTIPLY(z1,-FIX_0_899976223))
	imul ebx,(-F_2_562) ; ebx=z2(=MULTIPLY(z2,-FIX_2_562915447))

	add edx,esi ; edx=z3(=z3+z5)
	add edi,esi ; edi=z4(=z4+z5)

	lea ecx,[eax+edx] ; ecx=z1+z3
	lea esi,[ebx+edi] ; esi=z2+z4
	add eax,edi ; eax=z1+z4
	add ebx,edx ; ebx=z2+z3

	pop edx ; edx=tmp4
	pop edi ; edi=tmp5
	imul edx,(F_0_298) ; edx=tmp4(=MULTIPLY(tmp4,FIX_0_298631336))
	imul edi,(F_2_053) ; edi=tmp5(=MULTIPLY(tmp5,FIX_2_053119869))
	add ecx,edx ; ecx=data7(=tmp4+z1+z3)
	add esi,edi ; esi=data5(=tmp5+z2+z4)
	pop edx ; dataptr
	descale ecx,(CONST_BITS+PASS1_BITS)
	descale esi,(CONST_BITS+PASS1_BITS)
	mov DCTELEM [COL(7,edx,SIZEOF_DCTELEM)], cx
	mov DCTELEM [COL(5,edx,SIZEOF_DCTELEM)], si

	pop edi ; edi=tmp6
	pop ecx ; ecx=tmp7
	imul edi,(F_3_072) ; edi=tmp6(=MULTIPLY(tmp6,FIX_3_072711026))
	imul ecx,(F_1_501) ; ecx=tmp7(=MULTIPLY(tmp7,FIX_1_501321110))
	add ebx,edi ; ebx=data3(=tmp6+z2+z3)
	add eax,ecx ; eax=data1(=tmp7+z1+z4)
	pop ecx ; ctr
	descale ebx,(CONST_BITS+PASS1_BITS)
	descale eax,(CONST_BITS+PASS1_BITS)
	mov DCTELEM [COL(3,edx,SIZEOF_DCTELEM)], bx
	mov DCTELEM [COL(1,edx,SIZEOF_DCTELEM)], ax

	add edx, byte SIZEOF_DCTELEM ; advance pointer to next column
	dec ecx ; one column done; repeat while ctr != 0
	jnz near .columnloop

	; Restore the callee-saved registers in reverse push order.
	pop edi
	pop esi
	; pop edx ; need not be preserved
	; pop ecx ; need not be preserved
	pop ebx
	pop ebp
	ret

	%endif ; DCT_ISLOW_SUPPORTED