/*
* Copyright © 2008 Mozilla Corporation
* Copyright © 2010 Nokia Corporation
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of Mozilla Corporation not be used in
* advertising or publicity pertaining to distribution of the software without
* specific, written prior permission. Mozilla Corporation makes no
* representations about the suitability of this software for any purpose. It
* is provided "as is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*
* Author: Jeff Muizelaar (jeff@infidigm.net)
*
*/
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
        .text
        .arch armv6
        .object_arch armv4
        .arm
        .altmacro
        .p2align 2
/* Supplementary macro for setting function attributes */
.macro pixman_asm_function fname
        .func fname
        .global fname
#ifdef __ELF__
        .hidden fname
        .type fname, %function
#endif
fname:
.endm
/*
 * The code below was generated by gcc 4.3.4 from the commented-out
 * functions in the 'pixman-arm-simd.c' file, using the following
 * optimization options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer".
 *
 * TODO: replace the gcc-generated code with hand-tuned versions, because
 * the code quality is not very good, and introduce symbolic register
 * aliases for better readability and maintainability.
 */
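/*
 * For reference, a minimal C sketch of what the generated routine below
 * computes, reconstructed from its register usage (the function and
 * parameter names are illustrative, not the originals from
 * 'pixman-arm-simd.c'; assumes <stdint.h> types): every destination byte
 * becomes the saturated 8-bit sum of the source and destination bytes,
 * which is exactly what UQADD8 does four bytes at a time.
 *
 *  static void
 *  composite_add_8_8 (int32_t w, int32_t h,
 *                     uint8_t *dst, int32_t dst_stride,
 *                     uint8_t *src, int32_t src_stride)
 *  {
 *      while (h--)
 *      {
 *          int32_t i;
 *
 *          for (i = 0; i < w; i++)
 *          {
 *              uint32_t s = (uint32_t) src[i] + dst[i];
 *              dst[i] = s > 255 ? 255 : s;  // saturated add (UQADD8)
 *          }
 *          dst += dst_stride;
 *          src += src_stride;
 *      }
 *  }
 */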
pixman_asm_function pixman_composite_add_8_8_asm_armv6
/* Arguments, judging by the register usage below:
 *   r0 = width, r1 = height, r2 = dst, r3 = dst_stride,
 *   [sp, #36] = src, [sp, #40] = src_stride
 * (stack offsets as seen after the push and the 4-byte stack adjustment)
 */
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        mov     r10, r1
        sub     sp, sp, #4
        subs    r10, r10, #1
        mov     r11, r0
        mov     r8, r2
        str     r3, [sp]
        ldr     r7, [sp, #36]
        bcc     0f
6:      cmp     r11, #0                 /* row loop entry */
        beq     1f
        orr     r3, r8, r7
        tst     r3, #3                  /* src and dst both word-aligned? */
        beq     2f
        mov     r1, r8
        mov     r0, r7
        mov     r12, r11
        b       3f
5:      tst     r3, #3
        beq     4f
3:      ldrb    r2, [r0], #1            /* leading unaligned bytes */
        subs    r12, r12, #1
        ldrb    r3, [r1]
        uqadd8  r3, r2, r3
        strb    r3, [r1], #1
        orr     r3, r1, r0
        bne     5b
1:      ldr     r3, [sp]                /* advance to the next row */
        add     r8, r8, r3
        ldr     r3, [sp, #40]
        add     r7, r7, r3
10:     subs    r10, r10, #1
        bcs     6b
0:      add     sp, sp, #4
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
2:      mov     r12, r11
        mov     r1, r8
        mov     r0, r7
4:      cmp     r12, #3
        subgt   r6, r12, #4
        movgt   r9, r12
        lsrgt   r5, r6, #2
        addgt   r3, r5, #1
        movgt   r12, #0
        lslgt   r4, r3, #2
        ble     7f
8:      ldr     r3, [r0, r12]           /* aligned loop, 4 pixels per UQADD8 */
        ldr     r2, [r1, r12]
        uqadd8  r3, r3, r2
        str     r3, [r1, r12]
        add     r12, r12, #4
        cmp     r12, r4
        bne     8b
        sub     r3, r9, #4
        bic     r3, r3, #3
        add     r3, r3, #4
        subs    r12, r6, r5, lsl #2
        add     r1, r1, r3
        add     r0, r0, r3
        beq     1b
7:      mov     r4, #0
9:      ldrb    r3, [r1, r4]            /* trailing bytes */
        ldrb    r2, [r0, r4]
        uqadd8  r3, r2, r3
        strb    r3, [r1, r4]
        add     r4, r4, #1
        cmp     r4, r12
        bne     9b
        ldr     r3, [sp]
        add     r8, r8, r3
        ldr     r3, [sp, #40]
        add     r7, r7, r3
        b       10b
.endfunc
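/*
 * The next routine implements the premultiplied OVER operator:
 * dst = src + dst * (255 - src_alpha) / 255, per channel. A hedged C
 * model of the per-pixel step (names are illustrative): the division by
 * 255 uses the usual "x += 0x80; x += x >> 8; x >>= 8" rounding trick,
 * which the 0x00800080 constant and the UXTAB16 ..., ror #8 sequences
 * implement two channels at a time.
 *
 *  static uint32_t
 *  mul_div_255 (uint32_t x, uint32_t a)  // (x * a) / 255, rounded
 *  {
 *      x = x * a + 0x80;
 *      return (x + (x >> 8)) >> 8;
 *  }
 *
 *  static uint32_t
 *  over (uint32_t src, uint32_t dst)  // premultiplied a8r8g8b8 OVER
 *  {
 *      uint32_t ia = 255 - (src >> 24);
 *      uint32_t result = 0;
 *      int shift;
 *
 *      for (shift = 0; shift < 32; shift += 8)
 *      {
 *          uint32_t d = mul_div_255 ((dst >> shift) & 0xff, ia)
 *                     + ((src >> shift) & 0xff);
 *          result |= (d > 255 ? 255 : d) << shift;  // UQADD8 saturation
 *      }
 *      return result;
 *  }
 */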
pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
/* Arguments, judging by the register usage below:
 *   r0 = width, r1 = height, r2 = dst, r3 = dst_stride (in pixels),
 *   [sp, #52] = src, [sp, #56] = src_stride (in pixels)
 */
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        sub     sp, sp, #20
        cmp     r1, #0
        mov     r12, r2
        str     r1, [sp, #12]
        str     r0, [sp, #16]
        ldr     r2, [sp, #52]
        beq     0f
        lsl     r3, r3, #2              /* strides: pixels -> bytes */
        str     r3, [sp]
        ldr     r3, [sp, #56]
        mov     r10, #0
        lsl     r3, r3, #2
        str     r3, [sp, #8]
        mov     r11, r3
        b       1f
6:      ldr     r11, [sp, #8]
1:      ldr     r9, [sp]
        mov     r0, r12
        add     r12, r12, r9
        mov     r1, r2
        str     r12, [sp, #4]
        add     r2, r2, r11
        ldr     r12, [sp, #16]
        ldr     r3, =0x00800080         /* rounding bias for the /255 trick */
        ldr     r9, =0xff00ff00
        mov     r11, #255
        cmp     r12, #0
        beq     4f
5:      ldr     r5, [r1], #4            /* r5 = src, r4 = dst */
        ldr     r4, [r0]
        sub     r8, r11, r5, lsr #24    /* r8 = 255 - src alpha */
        uxtb16  r6, r4                  /* even dst channels (b, r) */
        uxtb16  r7, r4, ror #8          /* odd dst channels (g, a) */
        mla     r6, r6, r8, r3
        mla     r7, r7, r8, r3
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        and     r7, r7, r9
        uxtab16 r6, r7, r6, ror #8
        uqadd8  r5, r6, r5              /* dst * (255 - sa) / 255 + src */
        str     r5, [r0], #4
        subs    r12, r12, #1
        bne     5b
4:      ldr     r3, [sp, #12]
        add     r10, r10, #1
        cmp     r10, r3
        ldr     r12, [sp, #4]
        bne     6b
0:      add     sp, sp, #20
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
.endfunc
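/*
 * The next routine is the same OVER operation, except that each source
 * pixel is first multiplied by a constant 8-bit mask value, the alpha
 * byte of the mask argument (the ldrb from [sp, #71] assumes a
 * little-endian layout). In terms of the helpers sketched above
 * (illustrative names):
 *
 *  s = in (src, mask >> 24);
 *  dst = over (s, dst);
 *
 * where 'in' scales all four channels of src by the mask value:
 *
 *  static uint32_t
 *  in (uint32_t src, uint32_t m)
 *  {
 *      uint32_t result = 0;
 *      int shift;
 *
 *      for (shift = 0; shift < 32; shift += 8)
 *          result |= mul_div_255 ((src >> shift) & 0xff, m) << shift;
 *      return result;
 *  }
 */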
pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
/* Arguments, judging by the register usage below:
 *   r0 = width, r1 = height, r2 = dst, r3 = dst_stride (in pixels),
 *   [sp, #60] = src, [sp, #64] = src_stride (in pixels),
 *   [sp, #68] = mask (only its alpha byte is used)
 */
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        sub     sp, sp, #28
        cmp     r1, #0
        str     r1, [sp, #12]
        ldrb    r1, [sp, #71]           /* alpha byte of the mask */
        mov     r12, r2
        str     r0, [sp, #16]
        ldr     r2, [sp, #60]
        str     r1, [sp, #24]
        beq     0f
        lsl     r3, r3, #2
        str     r3, [sp, #20]
        ldr     r3, [sp, #64]
        mov     r10, #0
        lsl     r3, r3, #2
        str     r3, [sp, #8]
        mov     r11, r3
        b       1f
5:      ldr     r11, [sp, #8]
1:      ldr     r4, [sp, #20]
        mov     r0, r12
        mov     r1, r2
        add     r12, r12, r4
        add     r2, r2, r11
        str     r12, [sp]
        str     r2, [sp, #4]
        ldr     r12, [sp, #16]
        ldr     r2, =0x00800080
        ldr     r3, [sp, #24]
        mov     r11, #255
        cmp     r12, #0
        beq     3f
4:      ldr     r5, [r1], #4            /* r5 = src, r4 = dst */
        ldr     r4, [r0]
        uxtb16  r6, r5                  /* src IN mask: scale all four */
        uxtb16  r7, r5, ror #8          /* channels by the mask alpha */
        mla     r6, r6, r3, r2
        mla     r7, r7, r3, r2
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r5, r6, r7, lsl #8
        uxtb16  r6, r4                  /* then the usual OVER step */
        uxtb16  r7, r4, ror #8
        sub     r8, r11, r5, lsr #24
        mla     r6, r6, r8, r2
        mla     r7, r7, r8, r2
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r6, r6, r7, lsl #8
        uqadd8  r5, r6, r5
        str     r5, [r0], #4
        subs    r12, r12, #1
        bne     4b
3:      ldr     r1, [sp, #12]
        add     r10, r10, #1
        cmp     r10, r1
        ldr     r12, [sp]
        ldr     r2, [sp, #4]
        bne     5b
0:      add     sp, sp, #28
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
.endfunc
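/*
 * The next routine composites a constant source color through an a8
 * mask image, i.e. per destination pixel (again in terms of the sketches
 * above, with illustrative names):
 *
 *  dst[i] = over (in (src, mask[i]), dst[i]);
 *
 * The code precomputes the even (b, r) and odd (g, a) channels of the
 * constant source as packed halfwords, so the 'in' step costs only two
 * MLAs per pixel.
 */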
pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
/* Arguments, judging by the register usage below:
 *   r0 = width, r1 = height, r2 = dst, r3 = dst_stride (in pixels),
 *   [sp, #60] = src color, [sp, #68] = mask, [sp, #72] = mask_stride
 *   (in bytes)
 */
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        sub     sp, sp, #28
        cmp     r1, #0
        ldr     r9, [sp, #60]
        str     r1, [sp, #12]
        bic     r1, r9, #-16777216      /* split the constant source into */
        str     r1, [sp, #20]           /* even (b, r) ... */
        mov     r12, r2
        lsr     r1, r9, #8
        ldr     r2, [sp, #20]
        bic     r1, r1, #-16777216
        bic     r2, r2, #65280
        bic     r1, r1, #65280
        str     r2, [sp, #20]
        str     r0, [sp, #16]
        str     r1, [sp, #4]            /* ... and odd (g, a) halfwords */
        ldr     r2, [sp, #68]
        beq     0f
        lsl     r3, r3, #2
        str     r3, [sp, #24]
        mov     r0, #0
        b       1f
5:      ldr     r3, [sp, #24]
1:      ldr     r4, [sp, #72]
        mov     r10, r12
        mov     r1, r2
        add     r12, r12, r3
        add     r2, r2, r4
        str     r12, [sp, #8]
        str     r2, [sp]
        ldr     r12, [sp, #16]
        ldr     r11, =0x00800080
        ldr     r2, [sp, #4]
        ldr     r3, [sp, #20]
        cmp     r12, #0
        beq     3f
4:      ldrb    r5, [r1], #1            /* r5 = mask byte, r4 = dst */
        ldr     r4, [r10]
        mla     r6, r3, r5, r11         /* src IN mask */
        mla     r7, r2, r5, r11
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r5, r6, r7, lsl #8
        uxtb16  r6, r4                  /* then OVER dst */
        uxtb16  r7, r4, ror #8
        mvn     r8, r5
        lsr     r8, r8, #24
        mla     r6, r6, r8, r11
        mla     r7, r7, r8, r11
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r6, r6, r7, lsl #8
        uqadd8  r5, r6, r5
        str     r5, [r10], #4
        subs    r12, r12, #1
        bne     4b
3:      ldr     r4, [sp, #12]
        add     r0, r0, #1
        cmp     r0, r4
        ldr     r12, [sp, #8]
        ldr     r2, [sp]
        bne     5b
0:      add     sp, sp, #28
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
.endfunc
/*
 * Note: this code uses only armv5te instructions (not even armv6), but
 * is scheduled for the ARM Cortex-A8 pipeline, so it may need to be
 * split into a few variants, each tuned for a particular microarchitecture.
 *
 * TODO: to get good performance on ARM9/ARM11 cores (which don't have
 * efficient write combining), it needs to be changed to use 16-byte
 * aligned writes with the STM instruction.
 *
 * The nearest scanline scaler macro template takes the following arguments:
 *  fname                     - name of the function to generate
 *  bpp_shift                 - (1 << bpp_shift) is the size of a pixel
 *                              in bytes
 *  t                         - type suffix for the LDR/STR instructions
 *  prefetch_distance         - prefetch the source image that many
 *                              pixels ahead
 *  prefetch_braking_distance - stop prefetching when that many pixels
 *                              remain before the end of the scanline
*/
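/*
 * A rough C model of one generated scanline function (illustrative names;
 * 'pixel_t' stands for uint16_t or uint32_t depending on bpp_shift, and
 * vx/unit_x are 16.16 fixed-point values, the latter passed on the stack):
 *
 *  static void
 *  scanline_nearest (int32_t w, pixel_t *dst, const pixel_t *src,
 *                    int32_t vx, int32_t unit_x)
 *  {
 *      while (w--)
 *      {
 *          *dst++ = src[vx >> 16];  // nearest source pixel
 *          vx += unit_x;
 *      }
 *  }
 *
 * The 'and TMP, VXMASK, VX, lsr #(16 - bpp_shift)' below computes the
 * byte offset ((vx >> 16) << bpp_shift) in a single instruction.
 */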
.macro generate_nearest_scanline_func fname, bpp_shift, t, \
                                      prefetch_distance, \
                                      prefetch_braking_distance
pixman_asm_function fname
        W       .req    r0
        DST     .req    r1
        SRC     .req    r2
        VX      .req    r3
        UNIT_X  .req    ip
        TMP1    .req    r4
        TMP2    .req    r5
        VXMASK  .req    r6
        PF_OFFS .req    r7

        ldr     UNIT_X, [sp]
        push    {r4, r5, r6, r7}
        mvn     VXMASK, #((1 << bpp_shift) - 1)

        /* define helper macro */
        .macro scale_2_pixels
                ldr&t   TMP1, [SRC, TMP1]
                and     TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
                add     VX, VX, UNIT_X
                str&t   TMP1, [DST], #(1 << bpp_shift)

                ldr&t   TMP2, [SRC, TMP2]
                and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
                add     VX, VX, UNIT_X
                str&t   TMP2, [DST], #(1 << bpp_shift)
        .endm

        /* now do the scaling */
        and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
        add     VX, VX, UNIT_X
        subs    W, W, #(8 + prefetch_braking_distance)
        blt     2f
        /* calculate prefetch offset */
        mov     PF_OFFS, #prefetch_distance
        mla     PF_OFFS, UNIT_X, PF_OFFS, VX
1:      /* main loop, process 8 pixels per iteration with prefetch */
        subs    W, W, #8
        add     PF_OFFS, UNIT_X, lsl #3         /* PF_OFFS += 8 * UNIT_X */
        scale_2_pixels
        scale_2_pixels
        scale_2_pixels
        scale_2_pixels
        pld     [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
        bge     1b
2:
        subs    W, W, #(4 - 8 - prefetch_braking_distance)
        blt     2f
1:      /* process the remaining pixels */
        scale_2_pixels
        scale_2_pixels
        subs    W, W, #4
        bge     1b
2:
        tst     W, #2
        beq     2f
        scale_2_pixels
2:
        tst     W, #1
        ldrne&t TMP1, [SRC, TMP1]
        strne&t TMP1, [DST]
        /* cleanup helper macro */
        .purgem scale_2_pixels
        .unreq  DST
        .unreq  SRC
        .unreq  W
        .unreq  VX
        .unreq  UNIT_X
        .unreq  TMP1
        .unreq  TMP2
        .unreq  VXMASK
        .unreq  PF_OFFS
        /* return */
        pop     {r4, r5, r6, r7}
        bx      lr
.endfunc
.endm
generate_nearest_scanline_func \
        pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32

generate_nearest_scanline_func \
        pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
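/*
 * Judging by the register aliases in the macro, the two instantiations
 * should correspond to C prototypes along these lines (a sketch, not
 * verified against the pixman headers; vx and unit_x are 16.16
 * fixed-point values):
 *
 *  void pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
 *           (int32_t w, uint16_t *dst, const uint16_t *src,
 *            int32_t vx, int32_t unit_x);
 *  void pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6
 *           (int32_t w, uint32_t *dst, const uint32_t *src,
 *            int32_t vx, int32_t unit_x);
 *
 * The 0565 variant prefetches 80 pixels ahead and the 8888 variant 48,
 * both stopping 32 pixels before the end of the scanline.
 */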