; blob: 178e1f99cb5de2964d02f275762fa3d72e906d1e [file] [log] [blame]
;
; jfdctflt.asm - floating-point FDCT (non-SIMD)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
;
; Last Modified : October 17, 2004
;
; [TAB8]
%include "jsimdext.inc"
%include "jdct.inc"
%ifdef DCT_FLOAT_SUPPORTED
; This module is specialized to the case DCTSIZE = 8.
;
%if DCTSIZE != 8
%error "Sorry, this code only copes with 8x8 DCTs."
%endif
; --------------------------------------------------------------------------
; Constant table for jpeg_fdct_float.
;
; Four 32-bit floating-point values (dd) laid out contiguously, in this
; order, starting at the exported label jconst_fdct_float:
;   +0  F_0_382    +4  F_0_707    +8  F_0_541    +12  F_1_306
; The code in this file reads them as memory operands of fmul, using
; ROTATOR_TYPE as the operand-size keyword and GOTOFF(ebx,...) to form the
; address.
;
; NOTE(review): SEG_CONST, alignz, EXTN and FP32 are not defined in this
; file (they come from the %include'd headers).  alignz 16 presumably pads
; to a 16-byte boundary before and after the table -- confirm in
; jsimdext.inc.
SECTION SEG_CONST
%define ROTATOR_TYPE FP32 ; float: operand size of the table entries below
alignz 16
global EXTN(jconst_fdct_float)
EXTN(jconst_fdct_float):
F_0_382 dd 0.382683432365089771728460 ; cos(PI*3/8)
F_0_707 dd 0.707106781186547524400844 ; cos(PI*1/4)
F_0_541 dd 0.541196100146196984399723 ; cos(PI*1/8)-cos(PI*3/8)
F_1_306 dd 1.306562964876376527856643 ; cos(PI*1/8)+cos(PI*3/8)
alignz 16
; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jpeg_fdct_float (FAST_FLOAT * data)
;
; The block is transformed in place.  Pass 1 runs a 1-D butterfly network
; over DCTSIZE rows (pointer stepped by DCTSIZE*SIZEOF_FAST_FLOAT), pass 2
; runs the identical network over DCTSIZE columns (pointer stepped by
; SIZEOF_FAST_FLOAT).  Within one iteration all eight inputs are read
; before any output is stored.
;
; In:    [ebp+8] = data, the first stack argument (32-bit code, frame set
;        up with push ebp / mov ebp,esp).
; Out:   none; results are written back over *data.
; Regs:  edx = pointer to the current row / column
;        ecx = remaining iteration count
;        ebx = base register for GOTOFF() constant addressing
; Clobb: ecx, edx, EFLAGS, x87 state.  ebx is saved/restored through
;        pushpic/poppic.
; x87:   the register stack is balanced on exit, but it reaches a depth of
;        eight inside each loop, so it must be empty on entry.
;
; NOTE(review): pushpic, poppic, get_GOT, GOTOFF, POINTER, FAST_FLOAT,
; alignx, ROW and COL are macros from the %include'd headers and are not
; visible here.  ROW(n,...) / COL(n,...) presumably address element n
; along a row / down a column -- confirm in jdct.inc.
;
; Notation used in the comments below (d0..d7 are the eight inputs of the
; current row or column; stack pictures are written [st0 st1 st2 ...]):
;   tmp0 = d0+d7   tmp1 = d1+d6   tmp2 = d2+d5   tmp3 = d3+d4
;   tmp7 = d0-d7   tmp6 = d1-d6   tmp5 = d2-d5   tmp4 = d3-d4
;  Even part:
;   tmp10 = tmp0+tmp3   tmp13 = tmp0-tmp3
;   tmp11 = tmp1+tmp2   tmp12 = tmp1-tmp2
;   z1   = (tmp12+tmp13)*F_0_707
;   out0 = tmp10+tmp11  out4 = tmp10-tmp11
;   out2 = tmp13+z1     out6 = tmp13-z1
;  Odd part:
;   a  = tmp4+tmp5   b = tmp5+tmp6   c = tmp6+tmp7
;   z5 = (a-c)*F_0_382
;   z2 = a*F_0_541+z5   z4 = c*F_1_306+z5   z3 = b*F_0_707
;   z11 = tmp7+z3       z13 = tmp7-z3
;   out5 = z13+z2   out3 = z13-z2   out1 = z11+z4   out7 = z11-z4
;
%define data(b) (b)+8 ; FAST_FLOAT * data
align 16
global EXTN(jpeg_fdct_float)
EXTN(jpeg_fdct_float):
push ebp
mov ebp,esp
pushpic ebx
; push ecx ; need not be preserved
; push edx ; need not be preserved
; push esi ; unused
; push edi ; unused
get_GOT ebx ; get GOT address
; ---- Pass 1: process rows.
mov edx, POINTER [data(ebp)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE ; ecx = number of rows still to process
alignx 16,7
.rowloop:
; Sums of the mirrored input pairs.
fld FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
fadd FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)] ; [tmp1]
fld FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
fadd FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)] ; [tmp0 tmp1]
fld FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
fadd FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)] ; [tmp3 tmp0 tmp1]
fld FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
fadd FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)] ; [tmp2 tmp3 tmp0 tmp1]
; -- Even part
fld st2 ; st2 = st2 + st1, st1 = st2 - st1
fsub st0,st2 ; st0 = tmp0-tmp3 = tmp13
fxch st0,st2
faddp st3,st0 ; [tmp2 tmp13 tmp10 tmp1]
fld st3 ; st3 = st3 + st0, st0 = st3 - st0
fsub st0,st1 ; st0 = tmp1-tmp2 = tmp12
fxch st0,st1
faddp st4,st0 ; [tmp12 tmp13 tmp10 tmp11]
fadd st0,st1 ; st0 = tmp12+tmp13
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] ; [z1 tmp13 tmp10 tmp11]
fld st2 ; st3 = st2 + st3, st2 = st2 - st3
fsub st0,st4 ; st0 = tmp10-tmp11 = out4
fxch st0,st3
faddp st4,st0 ; [z1 tmp13 out4 out0]
fld st1 ; st0 = st1 + st0, st1 = st1 - st0
fsub st0,st1 ; st0 = tmp13-z1 = out6
fxch st0,st2
faddp st1,st0 ; [out2 out6 out4 out0]
; Compute the four pair differences from the still-unmodified inputs.
; After each one, fxch st0,st4 parks it below the four even outputs.
fld FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)]
fsub FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp7
fxch st0,st4 ; [out0 out2 out6 out4 tmp7]
fld FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)]
fsub FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp4
fxch st0,st4 ; [out4 out0 out2 out6 tmp4 tmp7]
fld FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)]
fsub FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp6
fxch st0,st4 ; [out6 out4 out0 out2 tmp6 tmp4 tmp7]
fld FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)]
fsub FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp5 (stack is 8 deep here)
fxch st0,st4 ; [out2 out6 out4 out0 tmp5 tmp6 tmp4 tmp7]
; Store the even outputs; the inputs they overwrite are no longer needed.
fstp FAST_FLOAT [ROW(2,edx,SIZEOF_FAST_FLOAT)] ; out2
fstp FAST_FLOAT [ROW(6,edx,SIZEOF_FAST_FLOAT)] ; out6
fstp FAST_FLOAT [ROW(4,edx,SIZEOF_FAST_FLOAT)] ; out4
fstp FAST_FLOAT [ROW(0,edx,SIZEOF_FAST_FLOAT)] ; out0 -> [tmp5 tmp6 tmp4 tmp7]
; -- Odd part
fadd st2,st0 ; st2 = tmp4+tmp5 = a
fadd st0,st1 ; st0 = tmp5+tmp6 = b
fxch st0,st3
fadd st1,st0 ; st1 = tmp6+tmp7 = c
fxch st0,st3 ; [b c a tmp7]
fld st2 ; [a b c a tmp7]
fxch st0,st1
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] ; st0 = b*F_0_707 = z3
fxch st0,st1 ; [a z3 c a tmp7]
fsub st0,st2 ; st0 = a-c
fxch st0,st3
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_541)] ; st0 = a*F_0_541
fxch st0,st3
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_382)] ; st0 = (a-c)*F_0_382 = z5
fxch st0,st2
fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_306)] ; st0 = c*F_1_306
fxch st0,st2 ; [z5 z3 c*F_1_306 a*F_0_541 tmp7]
fadd st3,st0 ; st3 = a*F_0_541+z5 = z2
faddp st2,st0 ; st2 = c*F_1_306+z5 = z4 -> [z3 z4 z2 tmp7]
fld st3 ; st3 = st3 + st0, st0 = st3 - st0
fsub st0,st1 ; st0 = tmp7-z3 = z13
fxch st0,st1
faddp st4,st0 ; [z13 z4 z2 z11]
fld st2 ; st0 = st0 + st2, st2 = st0 - st2
fsubr st0,st1 ; st0 = z13-z2 = out3 (fsubr: st1 - st0)
fxch st0,st3
faddp st1,st0 ; [out5 z4 out3 z11]
fld st1 ; st3 = st3 + st1, st1 = st3 - st1
fsubr st0,st4 ; st0 = z11-z4 = out7 (fsubr: st4 - st0)
fxch st0,st2
faddp st4,st0 ; [out5 out7 out3 out1]
fstp FAST_FLOAT [ROW(5,edx,SIZEOF_FAST_FLOAT)] ; out5
fstp FAST_FLOAT [ROW(7,edx,SIZEOF_FAST_FLOAT)] ; out7
fstp FAST_FLOAT [ROW(3,edx,SIZEOF_FAST_FLOAT)] ; out3
fstp FAST_FLOAT [ROW(1,edx,SIZEOF_FAST_FLOAT)] ; out1 -> x87 stack empty again
add edx, byte DCTSIZE*SIZEOF_FAST_FLOAT ; advance pointer to next row
dec ecx ; one row fewer to go
jnz near .rowloop ; near form: loop body is too long for a short jump
; ---- Pass 2: process columns.
; Same network as pass 1, addressed through COL() instead of ROW().
mov edx, POINTER [data(ebp)] ; (FAST_FLOAT *)
mov ecx, DCTSIZE ; ecx = number of columns still to process
alignx 16,7
.columnloop:
; Sums of the mirrored input pairs.
fld FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
fadd FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)] ; [tmp1]
fld FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
fadd FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)] ; [tmp0 tmp1]
fld FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
fadd FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)] ; [tmp3 tmp0 tmp1]
fld FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
fadd FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)] ; [tmp2 tmp3 tmp0 tmp1]
; -- Even part
fld st2 ; st2 = st2 + st1, st1 = st2 - st1
fsub st0,st2 ; st0 = tmp0-tmp3 = tmp13
fxch st0,st2
faddp st3,st0 ; [tmp2 tmp13 tmp10 tmp1]
fld st3 ; st3 = st3 + st0, st0 = st3 - st0
fsub st0,st1 ; st0 = tmp1-tmp2 = tmp12
fxch st0,st1
faddp st4,st0 ; [tmp12 tmp13 tmp10 tmp11]
fadd st0,st1 ; st0 = tmp12+tmp13
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] ; [z1 tmp13 tmp10 tmp11]
fld st2 ; st3 = st2 + st3, st2 = st2 - st3
fsub st0,st4 ; st0 = tmp10-tmp11 = out4
fxch st0,st3
faddp st4,st0 ; [z1 tmp13 out4 out0]
fld st1 ; st0 = st1 + st0, st1 = st1 - st0
fsub st0,st1 ; st0 = tmp13-z1 = out6
fxch st0,st2
faddp st1,st0 ; [out2 out6 out4 out0]
; Compute the four pair differences from the still-unmodified inputs.
; After each one, fxch st0,st4 parks it below the four even outputs.
fld FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)]
fsub FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp7
fxch st0,st4 ; [out0 out2 out6 out4 tmp7]
fld FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)]
fsub FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp4
fxch st0,st4 ; [out4 out0 out2 out6 tmp4 tmp7]
fld FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)]
fsub FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp6
fxch st0,st4 ; [out6 out4 out0 out2 tmp6 tmp4 tmp7]
fld FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)]
fsub FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)] ; st0 = tmp5 (stack is 8 deep here)
fxch st0,st4 ; [out2 out6 out4 out0 tmp5 tmp6 tmp4 tmp7]
; Store the even outputs; the inputs they overwrite are no longer needed.
fstp FAST_FLOAT [COL(2,edx,SIZEOF_FAST_FLOAT)] ; out2
fstp FAST_FLOAT [COL(6,edx,SIZEOF_FAST_FLOAT)] ; out6
fstp FAST_FLOAT [COL(4,edx,SIZEOF_FAST_FLOAT)] ; out4
fstp FAST_FLOAT [COL(0,edx,SIZEOF_FAST_FLOAT)] ; out0 -> [tmp5 tmp6 tmp4 tmp7]
; -- Odd part
fadd st2,st0 ; st2 = tmp4+tmp5 = a
fadd st0,st1 ; st0 = tmp5+tmp6 = b
fxch st0,st3
fadd st1,st0 ; st1 = tmp6+tmp7 = c
fxch st0,st3 ; [b c a tmp7]
fld st2 ; [a b c a tmp7]
fxch st0,st1
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_707)] ; st0 = b*F_0_707 = z3
fxch st0,st1 ; [a z3 c a tmp7]
fsub st0,st2 ; st0 = a-c
fxch st0,st3
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_541)] ; st0 = a*F_0_541
fxch st0,st3
fmul ROTATOR_TYPE [GOTOFF(ebx,F_0_382)] ; st0 = (a-c)*F_0_382 = z5
fxch st0,st2
fmul ROTATOR_TYPE [GOTOFF(ebx,F_1_306)] ; st0 = c*F_1_306
fxch st0,st2 ; [z5 z3 c*F_1_306 a*F_0_541 tmp7]
fadd st3,st0 ; st3 = a*F_0_541+z5 = z2
faddp st2,st0 ; st2 = c*F_1_306+z5 = z4 -> [z3 z4 z2 tmp7]
fld st3 ; st3 = st3 + st0, st0 = st3 - st0
fsub st0,st1 ; st0 = tmp7-z3 = z13
fxch st0,st1
faddp st4,st0 ; [z13 z4 z2 z11]
fld st2 ; st0 = st0 + st2, st2 = st0 - st2
fsubr st0,st1 ; st0 = z13-z2 = out3 (fsubr: st1 - st0)
fxch st0,st3
faddp st1,st0 ; [out5 z4 out3 z11]
fld st1 ; st3 = st3 + st1, st1 = st3 - st1
fsubr st0,st4 ; st0 = z11-z4 = out7 (fsubr: st4 - st0)
fxch st0,st2
faddp st4,st0 ; [out5 out7 out3 out1]
fstp FAST_FLOAT [COL(5,edx,SIZEOF_FAST_FLOAT)] ; out5
fstp FAST_FLOAT [COL(7,edx,SIZEOF_FAST_FLOAT)] ; out7
fstp FAST_FLOAT [COL(3,edx,SIZEOF_FAST_FLOAT)] ; out3
fstp FAST_FLOAT [COL(1,edx,SIZEOF_FAST_FLOAT)] ; out1 -> x87 stack empty again
add edx, byte SIZEOF_FAST_FLOAT ; advance pointer to next column
dec ecx ; one column fewer to go
jnz near .columnloop ; near form: loop body is too long for a short jump
; pop edi ; unused
; pop esi ; unused
; pop edx ; need not be preserved
; pop ecx ; need not be preserved
poppic ebx
pop ebp
ret
%endif ; DCT_FLOAT_SUPPORTED