| /* |
| * Copyright © 2008 Mozilla Corporation |
| * Copyright © 2010 Nokia Corporation |
| * |
| * Permission to use, copy, modify, distribute, and sell this software and its |
| * documentation for any purpose is hereby granted without fee, provided that |
| * the above copyright notice appear in all copies and that both that |
| * copyright notice and this permission notice appear in supporting |
| * documentation, and that the name of Mozilla Corporation not be used in |
| * advertising or publicity pertaining to distribution of the software without |
| * specific, written prior permission. Mozilla Corporation makes no |
| * representations about the suitability of this software for any purpose. It |
| * is provided "as is" without express or implied warranty. |
| * |
| * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS |
| * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY |
| * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN |
| * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
| * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS |
| * SOFTWARE. |
| * |
| * Author: Jeff Muizelaar (jeff@infidigm.net) |
| * |
| */ |
| |
| /* Prevent the stack from becoming executable */ |
| #if defined(__linux__) && defined(__ELF__) |
| .section .note.GNU-stack,"",%progbits |
| #endif |
| |
| .text |
| .arch armv6 |
| .object_arch armv4 |
| .arm |
| .altmacro |
| .p2align 2 |
| |
| /* Supplementary macro for setting function attributes */ |
| .macro pixman_asm_function fname |
| .func fname |
| .global fname |
| #ifdef __ELF__ |
| .hidden fname |
| .type fname, %function |
| #endif |
| fname: |
| .endm |
| |
| /* |
| * The code below was generated by gcc 4.3.4 from the commented-out |
| * functions in the 'pixman-arm-simd.c' file with the following |
| * optimization options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer" |
| * |
| * TODO: replace the gcc-generated code with hand-tuned versions, |
| * because the generated code quality is not very good. Also introduce |
| * symbolic register aliases for better readability and maintainability. |
| */ |
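| |
| /* |
| * For reference, the add_8_8 operation below is a per-byte saturating |
| * add of the source into the destination (the same operation that the |
| * UQADD8 instruction performs on four bytes at once). A minimal C |
| * sketch, with hypothetical parameter names: |
| * |
| * #include <stdint.h> |
| * |
| * static void |
| * add_8_8_scanline (uint8_t *dst, const uint8_t *src, int width) |
| * { |
| *     int i; |
| *     for (i = 0; i < width; i++) |
| *     { |
| *         unsigned t = dst[i] + src[i]; |
| *         dst[i] = t > 255 ? 255 : t; |
| *     } |
| * } |
| */ |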
| |
| pixman_asm_function pixman_composite_add_8_8_asm_armv6 |
| push {r4, r5, r6, r7, r8, r9, r10, r11} |
| mov r10, r1 /* r10 = height */ |
| sub sp, sp, #4 |
| subs r10, r10, #1 |
| mov r11, r0 /* r11 = width */ |
| mov r8, r2 /* r8 = dst */ |
| str r3, [sp] /* [sp] = dst_stride */ |
| ldr r7, [sp, #36] /* r7 = src, [sp, #40] = src_stride */ |
| bcc 0f /* nothing to do if height was zero */ |
| 6: cmp r11, #0 /* skip empty scanlines */ |
| beq 1f |
| orr r3, r8, r7 |
| tst r3, #3 /* are src and dst both word-aligned? */ |
| beq 2f |
| mov r1, r8 |
| mov r0, r7 |
| mov r12, r11 |
| b 3f |
| 5: tst r3, #3 /* switch to the word loop once both are aligned */ |
| beq 4f |
| 3: ldrb r2, [r0], #1 /* head loop: one byte at a time */ |
| subs r12, r12, #1 |
| ldrb r3, [r1] |
| uqadd8 r3, r2, r3 /* saturating add of src and dst bytes */ |
| strb r3, [r1], #1 |
| orr r3, r1, r0 |
| bne 5b |
| 1: ldr r3, [sp] /* advance src and dst to the next scanline */ |
| add r8, r8, r3 |
| ldr r3, [sp, #40] |
| add r7, r7, r3 |
| 10: subs r10, r10, #1 |
| bcs 6b |
| 0: add sp, sp, #4 |
| pop {r4, r5, r6, r7, r8, r9, r10, r11} |
| bx lr |
| 2: mov r12, r11 /* aligned case */ |
| mov r1, r8 |
| mov r0, r7 |
| 4: cmp r12, #3 /* at least 4 pixels left? */ |
| subgt r6, r12, #4 |
| movgt r9, r12 |
| lsrgt r5, r6, #2 |
| addgt r3, r5, #1 |
| movgt r12, #0 |
| lslgt r4, r3, #2 /* r4 = bytes handled by the word loop */ |
| ble 7f |
| 8: ldr r3, [r0, r12] /* word loop: 4 pixels per iteration */ |
| ldr r2, [r1, r12] |
| uqadd8 r3, r3, r2 |
| str r3, [r1, r12] |
| add r12, r12, #4 |
| cmp r12, r4 |
| bne 8b |
| sub r3, r9, #4 |
| bic r3, r3, #3 |
| add r3, r3, #4 |
| subs r12, r6, r5, lsl #2 /* r12 = number of trailing pixels */ |
| add r1, r1, r3 |
| add r0, r0, r3 |
| beq 1b |
| 7: mov r4, #0 |
| 9: ldrb r3, [r1, r4] /* tail loop: one byte at a time */ |
| ldrb r2, [r0, r4] |
| uqadd8 r3, r2, r3 |
| strb r3, [r1, r4] |
| add r4, r4, #1 |
| cmp r4, r12 |
| bne 9b |
| ldr r3, [sp] /* advance src and dst to the next scanline */ |
| add r8, r8, r3 |
| ldr r3, [sp, #40] |
| add r7, r7, r3 |
| b 10b |
| .endfunc |
| |
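| /* |
| * The function below implements the premultiplied-alpha OVER operator: |
| * dst = src + dst * (255 - src.alpha) / 255, evaluated per channel. |
| * The division by 255 uses the usual bias-and-fold approximation |
| * (t = x * a + 0x80; result = (t + (t >> 8)) >> 8), which is what the |
| * 0x00800080 constant and the UXTAB16/UXTB16 pairs compute on two |
| * 16-bit lanes at a time. A minimal C sketch of one pixel, with |
| * hypothetical helper names (for valid premultiplied input the |
| * per-channel sums cannot overflow; UQADD8 additionally saturates to |
| * guard against malformed input): |
| * |
| * #include <stdint.h> |
| * |
| * static inline uint32_t |
| * mul_un8 (uint32_t x, uint32_t a)   // x * a / 255, rounded |
| * { |
| *     uint32_t t = x * a + 0x80; |
| *     return (t + (t >> 8)) >> 8; |
| * } |
| * |
| * static inline uint32_t |
| * over_pixel (uint32_t src, uint32_t dst) |
| * { |
| *     uint32_t ia = 255 - (src >> 24), d = 0; |
| *     int c; |
| *     for (c = 0; c < 32; c += 8)    // blend each of a, r, g, b |
| *         d |= mul_un8 ((dst >> c) & 0xff, ia) << c; |
| *     return src + d;                // UQADD8 saturates per byte |
| * } |
| */ |
| |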
| pixman_asm_function pixman_composite_over_8888_8888_asm_armv6 |
| push {r4, r5, r6, r7, r8, r9, r10, r11} |
| sub sp, sp, #20 |
| cmp r1, #0 /* height == 0? */ |
| mov r12, r2 /* r12 = dst */ |
| str r1, [sp, #12] /* [sp, #12] = height */ |
| str r0, [sp, #16] /* [sp, #16] = width */ |
| ldr r2, [sp, #52] /* r2 = src */ |
| beq 0f |
| lsl r3, r3, #2 /* convert the strides from pixels to bytes */ |
| str r3, [sp] /* [sp] = dst_stride */ |
| ldr r3, [sp, #56] |
| mov r10, #0 /* r10 = scanline counter */ |
| lsl r3, r3, #2 |
| str r3, [sp, #8] /* [sp, #8] = src_stride */ |
| mov r11, r3 |
| b 1f |
| 6: ldr r11, [sp, #8] |
| 1: ldr r9, [sp] |
| mov r0, r12 |
| add r12, r12, r9 |
| mov r1, r2 |
| str r12, [sp, #4] |
| add r2, r2, r11 |
| ldr r12, [sp, #16] /* r12 = pixels left in this scanline */ |
| ldr r3, =0x00800080 /* rounding bias for the x/255 approximation */ |
| ldr r9, =0xff00ff00 |
| mov r11, #255 |
| cmp r12, #0 |
| beq 4f |
| 5: ldr r5, [r1], #4 /* r5 = src pixel */ |
| ldr r4, [r0] /* r4 = dst pixel */ |
| sub r8, r11, r5, lsr #24 /* r8 = 255 - src alpha */ |
| uxtb16 r6, r4 /* r6 = dst r/b bytes in 16-bit lanes */ |
| uxtb16 r7, r4, ror #8 /* r7 = dst a/g bytes in 16-bit lanes */ |
| mla r6, r6, r8, r3 /* multiply by (255 - alpha), add 0x80 bias */ |
| mla r7, r7, r8, r3 |
| uxtab16 r6, r6, r6, ror #8 /* x += x >> 8 in each lane */ |
| uxtab16 r7, r7, r7, ror #8 |
| and r7, r7, r9 /* keep the rounded high byte of the a/g lanes */ |
| uxtab16 r6, r7, r6, ror #8 /* repack into one 8888 pixel */ |
| uqadd8 r5, r6, r5 /* dst = src + dst * (255 - alpha) / 255 */ |
| str r5, [r0], #4 |
| subs r12, r12, #1 |
| bne 5b |
| 4: ldr r3, [sp, #12] |
| add r10, r10, #1 |
| cmp r10, r3 |
| ldr r12, [sp, #4] |
| bne 6b |
| 0: add sp, sp, #20 |
| pop {r4, r5, r6, r7, r8, r9, r10, r11} |
| bx lr |
| .endfunc |
| |
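| /* |
| * Same OVER operator as above, except that the source pixel is first |
| * multiplied by a constant mask alpha (the top byte of the 'mask' |
| * argument, loaded with the ldrb below). Roughly, reusing the |
| * hypothetical helpers sketched above: |
| * |
| * static inline uint32_t |
| * over_8888_n_pixel (uint32_t src, uint32_t dst, uint32_t mask_alpha) |
| * { |
| *     uint32_t s = 0; |
| *     int c; |
| *     for (c = 0; c < 32; c += 8)    // s = src * mask_alpha / 255 |
| *         s |= mul_un8 ((src >> c) & 0xff, mask_alpha) << c; |
| *     return over_pixel (s, dst); |
| * } |
| */ |
| |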
| pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6 |
| push {r4, r5, r6, r7, r8, r9, r10, r11} |
| sub sp, sp, #28 |
| cmp r1, #0 /* height == 0? */ |
| str r1, [sp, #12] /* [sp, #12] = height */ |
| ldrb r1, [sp, #71] /* r1 = mask alpha (top byte of the mask argument) */ |
| mov r12, r2 /* r12 = dst */ |
| str r0, [sp, #16] /* [sp, #16] = width */ |
| ldr r2, [sp, #60] /* r2 = src */ |
| str r1, [sp, #24] |
| beq 0f |
| lsl r3, r3, #2 |
| str r3, [sp, #20] |
| ldr r3, [sp, #64] |
| mov r10, #0 |
| lsl r3, r3, #2 |
| str r3, [sp, #8] |
| mov r11, r3 |
| b 1f |
| 5: ldr r11, [sp, #8] |
| 1: ldr r4, [sp, #20] |
| mov r0, r12 |
| mov r1, r2 |
| add r12, r12, r4 |
| add r2, r2, r11 |
| str r12, [sp] |
| str r2, [sp, #4] |
| ldr r12, [sp, #16] /* r12 = pixels left in this scanline */ |
| ldr r2, =0x00800080 /* rounding bias for the x/255 approximation */ |
| ldr r3, [sp, #24] /* r3 = mask alpha */ |
| mov r11, #255 |
| cmp r12, #0 |
| beq 3f |
| 4: ldr r5, [r1], #4 /* r5 = src pixel */ |
| ldr r4, [r0] /* r4 = dst pixel */ |
| uxtb16 r6, r5 /* split src into r/b and a/g lanes */ |
| uxtb16 r7, r5, ror #8 |
| mla r6, r6, r3, r2 /* multiply by mask alpha, add 0x80 bias */ |
| mla r7, r7, r3, r2 |
| uxtab16 r6, r6, r6, ror #8 /* x += x >> 8 in each lane */ |
| uxtab16 r7, r7, r7, ror #8 |
| uxtb16 r6, r6, ror #8 /* take the rounded high byte of each lane */ |
| uxtb16 r7, r7, ror #8 |
| orr r5, r6, r7, lsl #8 /* r5 = src * mask_alpha / 255 */ |
| uxtb16 r6, r4 /* now the usual OVER against dst */ |
| uxtb16 r7, r4, ror #8 |
| sub r8, r11, r5, lsr #24 /* r8 = 255 - alpha of the scaled src */ |
| mla r6, r6, r8, r2 |
| mla r7, r7, r8, r2 |
| uxtab16 r6, r6, r6, ror #8 |
| uxtab16 r7, r7, r7, ror #8 |
| uxtb16 r6, r6, ror #8 |
| uxtb16 r7, r7, ror #8 |
| orr r6, r6, r7, lsl #8 /* r6 = dst * (255 - alpha) / 255 */ |
| uqadd8 r5, r6, r5 /* combine with a saturating add */ |
| str r5, [r0], #4 |
| subs r12, r12, #1 |
| bne 4b |
| 3: ldr r1, [sp, #12] |
| add r10, r10, #1 |
| cmp r10, r1 |
| ldr r12, [sp] |
| ldr r2, [sp, #4] |
| bne 5b |
| 0: add sp, sp, #28 |
| pop {r4, r5, r6, r7, r8, r9, r10, r11} |
| bx lr |
| .endfunc |
| |
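| /* |
| * OVER with a solid source and an 8-bit mask: each destination pixel |
| * is blended with src * m / 255, where m is that pixel's mask byte. |
| * The solid color is split into its r/b and a/g halves once, outside |
| * the loops. Roughly, again with the hypothetical helpers from above: |
| * |
| * static inline uint32_t |
| * over_n_8_pixel (uint32_t src, uint32_t dst, uint8_t m) |
| * { |
| *     uint32_t s = 0; |
| *     int c; |
| *     for (c = 0; c < 32; c += 8)    // s = src * m / 255 |
| *         s |= mul_un8 ((src >> c) & 0xff, m) << c; |
| *     return over_pixel (s, dst); |
| * } |
| */ |
| |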
| pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 |
| push {r4, r5, r6, r7, r8, r9, r10, r11} |
| sub sp, sp, #28 |
| cmp r1, #0 /* height == 0? */ |
| ldr r9, [sp, #60] /* r9 = src (solid color) */ |
| str r1, [sp, #12] /* [sp, #12] = height */ |
| bic r1, r9, #-16777216 /* clear the alpha byte */ |
| str r1, [sp, #20] |
| mov r12, r2 /* r12 = dst */ |
| lsr r1, r9, #8 |
| ldr r2, [sp, #20] |
| bic r1, r1, #-16777216 |
| bic r2, r2, #65280 /* r2 = src & 0x00ff00ff (r/b half) */ |
| bic r1, r1, #65280 /* r1 = (src >> 8) & 0x00ff00ff (a/g half) */ |
| str r2, [sp, #20] |
| str r0, [sp, #16] /* [sp, #16] = width */ |
| str r1, [sp, #4] |
| ldr r2, [sp, #68] /* r2 = mask, [sp, #72] = mask_stride */ |
| beq 0f |
| lsl r3, r3, #2 |
| str r3, [sp, #24] |
| mov r0, #0 |
| b 1f |
| 5: ldr r3, [sp, #24] |
| 1: ldr r4, [sp, #72] |
| mov r10, r12 |
| mov r1, r2 |
| add r12, r12, r3 |
| add r2, r2, r4 |
| str r12, [sp, #8] |
| str r2, [sp] |
| ldr r12, [sp, #16] /* r12 = pixels left in this scanline */ |
| ldr r11, =0x00800080 /* rounding bias for the x/255 approximation */ |
| ldr r2, [sp, #4] /* r2 = src a/g half */ |
| ldr r3, [sp, #20] /* r3 = src r/b half */ |
| cmp r12, #0 |
| beq 3f |
| 4: ldrb r5, [r1], #1 /* r5 = mask byte m */ |
| ldr r4, [r10] /* r4 = dst pixel */ |
| mla r6, r3, r5, r11 /* src r/b and a/g lanes times m, 0x80 bias */ |
| mla r7, r2, r5, r11 |
| uxtab16 r6, r6, r6, ror #8 /* x += x >> 8 in each lane */ |
| uxtab16 r7, r7, r7, ror #8 |
| uxtb16 r6, r6, ror #8 /* take the rounded high byte of each lane */ |
| uxtb16 r7, r7, ror #8 |
| orr r5, r6, r7, lsl #8 /* r5 = src * m / 255 */ |
| uxtb16 r6, r4 /* now the usual OVER against dst */ |
| uxtb16 r7, r4, ror #8 |
| mvn r8, r5 |
| lsr r8, r8, #24 /* r8 = 255 - alpha of the scaled src */ |
| mla r6, r6, r8, r11 |
| mla r7, r7, r8, r11 |
| uxtab16 r6, r6, r6, ror #8 |
| uxtab16 r7, r7, r7, ror #8 |
| uxtb16 r6, r6, ror #8 |
| uxtb16 r7, r7, ror #8 |
| orr r6, r6, r7, lsl #8 /* r6 = dst * (255 - alpha) / 255 */ |
| uqadd8 r5, r6, r5 /* combine with a saturating add */ |
| str r5, [r10], #4 |
| subs r12, r12, #1 |
| bne 4b |
| 3: ldr r4, [sp, #12] |
| add r0, r0, #1 |
| cmp r0, r4 |
| ldr r12, [sp, #8] |
| ldr r2, [sp] |
| bne 5b |
| 0: add sp, sp, #28 |
| pop {r4, r5, r6, r7, r8, r9, r10, r11} |
| bx lr |
| .endfunc |
| |
| /* |
| * Note: this code uses only armv5te instructions (not even armv6), |
| * but is scheduled for the ARM Cortex-A8 pipeline. So it might need |
| * to be split into a few variants, each tuned for a particular |
| * microarchitecture. |
| * |
| * TODO: in order to get good performance on ARM9/ARM11 cores (which |
| * don't have efficient write combining), it needs to be changed to do |
| * 16-byte aligned writes using the STM instruction. |
| * |
| * The nearest scanline scaler macro template takes the following |
| * arguments: |
| * fname - name of the function to generate |
| * bpp_shift - (1 << bpp_shift) is the size of a pixel in bytes |
| * t - type suffix for LDR/STR instructions |
| * prefetch_distance - prefetch that many pixels ahead in the |
| * source image |
| * prefetch_braking_distance - stop prefetching when that many pixels |
| * remain before the end of the scanline |
| */ |
| |
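| /* |
| * The generated functions step a 16.16 fixed-point source x coordinate |
| * across one scanline. A minimal C sketch of the 32bpp case, with |
| * hypothetical parameter names matching the register aliases below |
| * (the assembly additionally unrolls the loop and prefetches ahead in |
| * the source image): |
| * |
| * static void |
| * scale_scanline_8888 (int32_t w, uint32_t *dst, const uint32_t *src, |
| *                      uint32_t vx, uint32_t unit_x) |
| * { |
| *     while (w--) |
| *     { |
| *         *dst++ = src[vx >> 16]; |
| *         vx += unit_x; |
| *     } |
| * } |
| */ |
| |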
| .macro generate_nearest_scanline_func fname, bpp_shift, t, \ |
| prefetch_distance, \ |
| prefetch_braking_distance |
| |
| pixman_asm_function fname |
| W .req r0 |
| DST .req r1 |
| SRC .req r2 |
| VX .req r3 |
| UNIT_X .req ip |
| TMP1 .req r4 |
| TMP2 .req r5 |
| VXMASK .req r6 |
| PF_OFFS .req r7 |
| |
| ldr UNIT_X, [sp] /* fifth argument: x increment per pixel (16.16) */ |
| push {r4, r5, r6, r7} |
| mvn VXMASK, #((1 << bpp_shift) - 1) /* mask to pixel-aligned byte offsets */ |
| |
| /* define a helper macro: the offset for each load was computed in the |
| previous half-iteration, so address arithmetic and memory accesses |
| overlap (simple software pipelining) */ |
| .macro scale_2_pixels |
| ldr&t TMP1, [SRC, TMP1] |
| and TMP2, VXMASK, VX, lsr #(16 - bpp_shift) |
| add VX, VX, UNIT_X |
| str&t TMP1, [DST], #(1 << bpp_shift) |
| |
| ldr&t TMP2, [SRC, TMP2] |
| and TMP1, VXMASK, VX, lsr #(16 - bpp_shift) |
| add VX, VX, UNIT_X |
| str&t TMP2, [DST], #(1 << bpp_shift) |
| .endm |
| |
| /* now do the scaling */ |
| and TMP1, VXMASK, VX, lsr #(16 - bpp_shift) /* byte offset of first pixel */ |
| add VX, VX, UNIT_X |
| subs W, W, #(8 + prefetch_braking_distance) |
| blt 2f |
| /* calculate prefetch offset */ |
| mov PF_OFFS, #prefetch_distance |
| mla PF_OFFS, UNIT_X, PF_OFFS, VX /* PF_OFFS = vx + prefetch_distance * unit_x */ |
| 1: /* main loop, process 8 pixels per iteration with prefetch */ |
| subs W, W, #8 |
| add PF_OFFS, UNIT_X, lsl #3 /* advance the prefetch x by 8 * unit_x */ |
| scale_2_pixels |
| scale_2_pixels |
| scale_2_pixels |
| scale_2_pixels |
| pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)] |
| bge 1b |
| 2: |
| subs W, W, #(4 - 8 - prefetch_braking_distance) |
| blt 2f |
| 1: /* process the remaining pixels */ |
| scale_2_pixels |
| scale_2_pixels |
| subs W, W, #4 |
| bge 1b |
| 2: |
| tst W, #2 |
| beq 2f |
| scale_2_pixels |
| 2: |
| tst W, #1 /* one odd pixel left? its offset is already in TMP1 */ |
| ldrne&t TMP1, [SRC, TMP1] |
| strne&t TMP1, [DST] |
| /* cleanup helper macro */ |
| .purgem scale_2_pixels |
| .unreq DST |
| .unreq SRC |
| .unreq W |
| .unreq VX |
| .unreq UNIT_X |
| .unreq TMP1 |
| .unreq TMP2 |
| .unreq VXMASK |
| .unreq PF_OFFS |
| /* return */ |
| pop {r4, r5, r6, r7} |
| bx lr |
| .endfunc |
| .endm |
| |
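| /* |
| * Instantiate the scaler for 16bpp r5g6b5 pixels (bpp_shift = 1, |
| * halfword loads/stores) and 32bpp a8r8g8b8 pixels (bpp_shift = 2, |
| * word loads/stores), with the prefetch distance and prefetch braking |
| * distance passed as the last two arguments. |
| */ |
| |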
| generate_nearest_scanline_func \ |
| pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 |
| |
| generate_nearest_scanline_func \ |
| pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32 |