; blob: 0f2972510ee02bd8e7d8dc580b17bfe307bfeafe [file] [log] [blame]
;
; jfdctint.asm - accurate integer FDCT (non-SIMD)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a slow-but-accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
;
; Last Modified : October 17, 2004
;
; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
%ifdef DCT_ISLOW_SUPPORTED
; This module is specialized to the case DCTSIZE = 8.
; DCTSIZE is not defined in this file (presumably it comes from one of the
; includes above - TODO confirm); any other value stops assembly with %error.
;
%if DCTSIZE != 8
%error "Sorry, this code only copes with 8x8 DCTs."
%endif
; --------------------------------------------------------------------------
; descale reg32, N
;
; Round-to-nearest right shift of a signed DWORD register that carries N
; extra fractional bits:   reg32 = (reg32 + (1 << (N-1))) >> N   (arithmetic).
;
;   %1 = 32-bit register, modified in place (flags are modified as well)
;   %2 = assemble-time constant shift count N
;
; The rounding term 1<<(N-1) is requested as a sign-extended imm8 only while
; it is at most 64 (N <= 7); from N = 8 upward it is emitted as a full imm32.
;
%macro descale 2
%if (%2)>7
add %1, (1<<((%2)-1)) ; add reg32,imm32
%else
add %1, byte (1<<((%2)-1)) ; add reg32,imm8
%endif
sar %1,%2
%endmacro
; --------------------------------------------------------------------------
; Fixed-point parameters.
;   CONST_BITS : number of fractional bits carried by the F_x_xxx multipliers
;   PASS1_BITS : extra fractional bits kept in the row-pass results
%define CONST_BITS 13
%define PASS1_BITS 2
%if CONST_BITS != 13
; Generic precision: NASM cannot do compile-time arithmetic on floating-point
; constants, so each multiplier is written as an integer scaled by 2^30 and
; rounded down to CONST_BITS fractional bits at assembly time.
%define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
%else
; CONST_BITS == 13: literal multipliers, i.e. each value times 2^13.
F_0_298 equ 2446 ; FIX(0.298631336)
F_0_390 equ 3196 ; FIX(0.390180644)
F_0_541 equ 4433 ; FIX(0.541196100)
F_0_765 equ 6270 ; FIX(0.765366865)
F_0_899 equ 7373 ; FIX(0.899976223)
F_1_175 equ 9633 ; FIX(1.175875602)
F_1_501 equ 12299 ; FIX(1.501321110)
F_1_847 equ 15137 ; FIX(1.847759065)
F_1_961 equ 16069 ; FIX(1.961570560)
F_2_053 equ 16819 ; FIX(2.053119869)
F_2_562 equ 20995 ; FIX(2.562915447)
F_3_072 equ 25172 ; FIX(3.072711026)
%endif
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jpeg_fdct_islow (DCTELEM * data)
;
; In:    data  - pointer to the block, read from the stack at [ebp+8]
;                (see data(b) below). The block is transformed in place.
; Out:   none (declared void); every element of the block is overwritten.
; Saved: ebp, ebx, esi, edi (pushed on entry, popped before ret).
; Clobb: eax, ecx, edx, flags.
;
; The routine makes two passes over the block, each running DCTSIZE times:
;   Pass 1 steps the data pointer by DCTSIZE*SIZEOF_DCTELEM and addresses
;          elements through ROW(n,...);
;   Pass 2 steps the data pointer by SIZEOF_DCTELEM and addresses elements
;          through COL(n,...).
; ROW, COL, POINTER, INT32, DCTELEM, EXTN and alignx are not defined in this
; file (presumably they come from jsimdext.inc / jdct.inc - TODO confirm).
;
; Both loop bodies use the same register plan and the same temporary stack
; frame. After the six pushes at the top of an iteration:
;   [esp+ 0] = tmp4
;   [esp+ 4] = tmp5
;   [esp+ 8] = dataptr (current row / column pointer)
;   [esp+12] = tmp6
;   [esp+16] = tmp7
;   [esp+20] = ctr     (remaining iteration count)
; The odd part pops these in exactly that order, so the stack is balanced
; again when the loop branch is reached.
;
%define data(b) (b)+8 ; DCTELEM * data
align 16
global EXTN(jpeg_fdct_islow)
EXTN(jpeg_fdct_islow):
push ebp
mov ebp,esp
push ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
push esi
push edi
; ---- Pass 1: process rows.
; Results of this pass are left scaled up by 2^PASS1_BITS: data0/data4 are
; shifted left by PASS1_BITS, and the multiplied terms (which carry
; CONST_BITS fractional bits from the F_x_xxx constants) are descaled by
; only CONST_BITS-PASS1_BITS.
mov edx, POINTER [data(ebp)] ; (DCTELEM *)
mov ecx, DCTSIZE
alignx 16,7
.rowloop:
; Load the eight inputs as sign-extended pairs (0,7) (1,6) (2,5) (3,4).
; Sums stay in registers (tmp0..tmp3); differences (tmp7..tmp4) are pushed.
movsx eax, DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)]
movsx edi, DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)]
lea esi,[eax+edi] ; esi=tmp0
sub eax,edi ; eax=tmp7
push ecx ; ctr
push eax ; save tmp7
movsx ebx, DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)]
movsx ecx, DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)]
lea edi,[ebx+ecx] ; edi=tmp1
sub ebx,ecx ; ebx=tmp6
push ebx ; save tmp6
movsx eax, DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)]
movsx ecx, DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)]
lea ebx,[eax+ecx] ; ebx=tmp2
sub eax,ecx ; eax=tmp5
push edx ; dataptr
push eax ; save tmp5
movsx ecx, DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)]
movsx eax, DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)]
lea edx,[ecx+eax] ; edx=tmp3
sub ecx,eax ; ecx=tmp4
push ecx ; save tmp4
; -- Even part
; In: esi=tmp0, edi=tmp1, ebx=tmp2, edx=tmp3
lea eax,[esi+edx] ; eax=tmp10
lea ecx,[edi+ebx] ; ecx=tmp11
sub esi,edx ; esi=tmp13
sub edi,ebx ; edi=tmp12
lea ebx,[eax+ecx] ; ebx=data0
sub eax,ecx ; eax=data4
mov edx, POINTER [esp+8] ; dataptr
sal ebx, PASS1_BITS
sal eax, PASS1_BITS
; Only the low 16 bits of each result register are stored.
mov DCTELEM [ROW(0,edx,SIZEOF_DCTELEM)], bx
mov DCTELEM [ROW(4,edx,SIZEOF_DCTELEM)], ax
lea ecx,[edi+esi] ; ecx=tmp12+tmp13
imul ecx,(F_0_541) ; ecx=z1
imul esi,(F_0_765) ; esi=MULTIPLY(tmp13,FIX_0_765366865)
imul edi,(-F_1_847) ; edi=MULTIPLY(tmp12,-FIX_1_847759065)
add esi,ecx ; esi=data2
add edi,ecx ; edi=data6
descale esi,(CONST_BITS-PASS1_BITS)
descale edi,(CONST_BITS-PASS1_BITS)
mov DCTELEM [ROW(2,edx,SIZEOF_DCTELEM)], si
mov DCTELEM [ROW(6,edx,SIZEOF_DCTELEM)], di
; -- Odd part
; Reload tmp4..tmp7 without popping; the saved copies are still needed
; below for the per-term multiplies.
mov eax, INT32 [esp] ; eax=tmp4
mov ebx, INT32 [esp+4] ; ebx=tmp5
mov ecx, INT32 [esp+12] ; ecx=tmp6
mov esi, INT32 [esp+16] ; esi=tmp7
lea edx,[eax+ecx] ; edx=z3
lea edi,[ebx+esi] ; edi=z4
add eax,esi ; eax=z1
add ebx,ecx ; ebx=z2
lea esi,[edx+edi] ; esi=z3+z4
imul esi,(F_1_175) ; esi=z5
imul edx,(-F_1_961) ; edx=z3(=MULTIPLY(z3,-FIX_1_961570560))
imul edi,(-F_0_390) ; edi=z4(=MULTIPLY(z4,-FIX_0_390180644))
imul eax,(-F_0_899) ; eax=z1(=MULTIPLY(z1,-FIX_0_899976223))
imul ebx,(-F_2_562) ; ebx=z2(=MULTIPLY(z2,-FIX_2_562915447))
add edx,esi ; edx=z3(=z3+z5)
add edi,esi ; edi=z4(=z4+z5)
lea ecx,[eax+edx] ; ecx=z1+z3
lea esi,[ebx+edi] ; esi=z2+z4
add eax,edi ; eax=z1+z4
add ebx,edx ; ebx=z2+z3
pop edx ; edx=tmp4
pop edi ; edi=tmp5
imul edx,(F_0_298) ; edx=tmp4(=MULTIPLY(tmp4,FIX_0_298631336))
imul edi,(F_2_053) ; edi=tmp5(=MULTIPLY(tmp5,FIX_2_053119869))
add ecx,edx ; ecx=data7(=tmp4+z1+z3)
add esi,edi ; esi=data5(=tmp5+z2+z4)
pop edx ; dataptr
descale ecx,(CONST_BITS-PASS1_BITS)
descale esi,(CONST_BITS-PASS1_BITS)
mov DCTELEM [ROW(7,edx,SIZEOF_DCTELEM)], cx
mov DCTELEM [ROW(5,edx,SIZEOF_DCTELEM)], si
pop edi ; edi=tmp6
pop ecx ; ecx=tmp7
imul edi,(F_3_072) ; edi=tmp6(=MULTIPLY(tmp6,FIX_3_072711026))
imul ecx,(F_1_501) ; ecx=tmp7(=MULTIPLY(tmp7,FIX_1_501321110))
add ebx,edi ; ebx=data3(=tmp6+z2+z3)
add eax,ecx ; eax=data1(=tmp7+z1+z4)
pop ecx ; ctr
descale ebx,(CONST_BITS-PASS1_BITS)
descale eax,(CONST_BITS-PASS1_BITS)
mov DCTELEM [ROW(3,edx,SIZEOF_DCTELEM)], bx
mov DCTELEM [ROW(1,edx,SIZEOF_DCTELEM)], ax
add edx, byte DCTSIZE*SIZEOF_DCTELEM ; advance pointer to next row
dec ecx ; one row done; loop while ctr != 0
; explicit near jump (presumably because the loop body is too long for a
; short branch)
jnz near .rowloop
; ---- Pass 2: process columns.
; Same computation as pass 1, addressed through COL(). The extra
; 2^PASS1_BITS scaling left by pass 1 is removed here: data0/data4 are
; descaled by PASS1_BITS and the multiplied terms by CONST_BITS+PASS1_BITS.
mov edx, POINTER [data(ebp)] ; (DCTELEM *)
mov ecx, DCTSIZE
alignx 16,7
.columnloop:
; Load the eight inputs as sign-extended pairs (0,7) (1,6) (2,5) (3,4).
; Sums stay in registers (tmp0..tmp3); differences (tmp7..tmp4) are pushed.
movsx eax, DCTELEM [COL(0,edx,SIZEOF_DCTELEM)]
movsx edi, DCTELEM [COL(7,edx,SIZEOF_DCTELEM)]
lea esi,[eax+edi] ; esi=tmp0
sub eax,edi ; eax=tmp7
push ecx ; ctr
push eax ; save tmp7
movsx ebx, DCTELEM [COL(1,edx,SIZEOF_DCTELEM)]
movsx ecx, DCTELEM [COL(6,edx,SIZEOF_DCTELEM)]
lea edi,[ebx+ecx] ; edi=tmp1
sub ebx,ecx ; ebx=tmp6
push ebx ; save tmp6
movsx eax, DCTELEM [COL(2,edx,SIZEOF_DCTELEM)]
movsx ecx, DCTELEM [COL(5,edx,SIZEOF_DCTELEM)]
lea ebx,[eax+ecx] ; ebx=tmp2
sub eax,ecx ; eax=tmp5
push edx ; dataptr
push eax ; save tmp5
movsx ecx, DCTELEM [COL(3,edx,SIZEOF_DCTELEM)]
movsx eax, DCTELEM [COL(4,edx,SIZEOF_DCTELEM)]
lea edx,[ecx+eax] ; edx=tmp3
sub ecx,eax ; ecx=tmp4
push ecx ; save tmp4
; -- Even part
; In: esi=tmp0, edi=tmp1, ebx=tmp2, edx=tmp3
lea eax,[esi+edx] ; eax=tmp10
lea ecx,[edi+ebx] ; ecx=tmp11
sub esi,edx ; esi=tmp13
sub edi,ebx ; edi=tmp12
lea ebx,[eax+ecx] ; ebx=data0
sub eax,ecx ; eax=data4
mov edx, POINTER [esp+8] ; dataptr
descale ebx, PASS1_BITS
descale eax, PASS1_BITS
; Only the low 16 bits of each result register are stored.
mov DCTELEM [COL(0,edx,SIZEOF_DCTELEM)], bx
mov DCTELEM [COL(4,edx,SIZEOF_DCTELEM)], ax
lea ecx,[edi+esi] ; ecx=tmp12+tmp13
imul ecx,(F_0_541) ; ecx=z1
imul esi,(F_0_765) ; esi=MULTIPLY(tmp13,FIX_0_765366865)
imul edi,(-F_1_847) ; edi=MULTIPLY(tmp12,-FIX_1_847759065)
add esi,ecx ; esi=data2
add edi,ecx ; edi=data6
descale esi,(CONST_BITS+PASS1_BITS)
descale edi,(CONST_BITS+PASS1_BITS)
mov DCTELEM [COL(2,edx,SIZEOF_DCTELEM)], si
mov DCTELEM [COL(6,edx,SIZEOF_DCTELEM)], di
; -- Odd part
; Reload tmp4..tmp7 without popping; the saved copies are still needed
; below for the per-term multiplies.
mov eax, INT32 [esp] ; eax=tmp4
mov ebx, INT32 [esp+4] ; ebx=tmp5
mov ecx, INT32 [esp+12] ; ecx=tmp6
mov esi, INT32 [esp+16] ; esi=tmp7
lea edx,[eax+ecx] ; edx=z3
lea edi,[ebx+esi] ; edi=z4
add eax,esi ; eax=z1
add ebx,ecx ; ebx=z2
lea esi,[edx+edi] ; esi=z3+z4
imul esi,(F_1_175) ; esi=z5
imul edx,(-F_1_961) ; edx=z3(=MULTIPLY(z3,-FIX_1_961570560))
imul edi,(-F_0_390) ; edi=z4(=MULTIPLY(z4,-FIX_0_390180644))
imul eax,(-F_0_899) ; eax=z1(=MULTIPLY(z1,-FIX_0_899976223))
imul ebx,(-F_2_562) ; ebx=z2(=MULTIPLY(z2,-FIX_2_562915447))
add edx,esi ; edx=z3(=z3+z5)
add edi,esi ; edi=z4(=z4+z5)
lea ecx,[eax+edx] ; ecx=z1+z3
lea esi,[ebx+edi] ; esi=z2+z4
add eax,edi ; eax=z1+z4
add ebx,edx ; ebx=z2+z3
pop edx ; edx=tmp4
pop edi ; edi=tmp5
imul edx,(F_0_298) ; edx=tmp4(=MULTIPLY(tmp4,FIX_0_298631336))
imul edi,(F_2_053) ; edi=tmp5(=MULTIPLY(tmp5,FIX_2_053119869))
add ecx,edx ; ecx=data7(=tmp4+z1+z3)
add esi,edi ; esi=data5(=tmp5+z2+z4)
pop edx ; dataptr
descale ecx,(CONST_BITS+PASS1_BITS)
descale esi,(CONST_BITS+PASS1_BITS)
mov DCTELEM [COL(7,edx,SIZEOF_DCTELEM)], cx
mov DCTELEM [COL(5,edx,SIZEOF_DCTELEM)], si
pop edi ; edi=tmp6
pop ecx ; ecx=tmp7
imul edi,(F_3_072) ; edi=tmp6(=MULTIPLY(tmp6,FIX_3_072711026))
imul ecx,(F_1_501) ; ecx=tmp7(=MULTIPLY(tmp7,FIX_1_501321110))
add ebx,edi ; ebx=data3(=tmp6+z2+z3)
add eax,ecx ; eax=data1(=tmp7+z1+z4)
pop ecx ; ctr
descale ebx,(CONST_BITS+PASS1_BITS)
descale eax,(CONST_BITS+PASS1_BITS)
mov DCTELEM [COL(3,edx,SIZEOF_DCTELEM)], bx
mov DCTELEM [COL(1,edx,SIZEOF_DCTELEM)], ax
add edx, byte SIZEOF_DCTELEM ; advance pointer to next column
dec ecx ; one column done; loop while ctr != 0
jnz near .columnloop
; Restore the registers saved in the prologue, in reverse push order.
pop edi
pop esi
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
pop ebx
pop ebp
ret
%endif ; DCT_ISLOW_SUPPORTED