/*
* Copyright © 2008 Mozilla Corporation
* Copyright © 2010 Nokia Corporation
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of Mozilla Corporation not be used in
* advertising or publicity pertaining to distribution of the software without
* specific, written prior permission. Mozilla Corporation makes no
* representations about the suitability of this software for any purpose. It
* is provided "as is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*
* Author: Jeff Muizelaar (jeff@infidigm.net)
*
*/
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
        .text
        .arch armv6
        .object_arch armv4
        .arm
        .altmacro
        .p2align 2
/* Supplementary macro for setting function attributes */
.macro pixman_asm_function fname
        .func fname
        .global fname
#ifdef __ELF__
        .hidden fname
        .type fname, %function
#endif
fname:
.endm
/*
 * The code below was generated by gcc 4.3.4 from the commented-out
 * functions in the 'pixman-arm-simd.c' file, using the following
 * optimization options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer".
 *
 * TODO: replace the gcc-generated code with hand-tuned versions, because
 * the code quality is not very good, and introduce symbolic register
 * aliases for better readability and maintainability.
 */
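/*
 * For reference, a minimal C sketch of what the generated routine below
 * computes, reconstructed from its register usage (the function and
 * parameter names are illustrative, not the originals from
 * 'pixman-arm-simd.c'; assumes <stdint.h> types): every destination byte
 * becomes the saturated 8-bit sum of the source and destination bytes,
 * which is exactly what UQADD8 does four bytes at a time.
 *
 *  static void
 *  composite_add_8_8 (int32_t w, int32_t h,
 *                     uint8_t *dst, int32_t dst_stride,
 *                     uint8_t *src, int32_t src_stride)
 *  {
 *      while (h--)
 *      {
 *          int32_t i;
 *
 *          for (i = 0; i < w; i++)
 *          {
 *              uint32_t s = (uint32_t) src[i] + dst[i];
 *              dst[i] = s > 255 ? 255 : s;  // saturated add (UQADD8)
 *          }
 *          dst += dst_stride;
 *          src += src_stride;
 *      }
 *  }
 */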
pixman_asm_function pixman_composite_add_8_8_asm_armv6
/* Arguments, judging by the register usage below:
 *   r0 = width, r1 = height, r2 = dst, r3 = dst_stride,
 *   [sp, #36] = src, [sp, #40] = src_stride
 * (stack offsets as seen after the push and the 4-byte stack adjustment)
 */
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        mov     r10, r1
        sub     sp, sp, #4
        subs    r10, r10, #1
        mov     r11, r0
        mov     r8, r2
        str     r3, [sp]
        ldr     r7, [sp, #36]
        bcc     0f
6:      cmp     r11, #0                 /* row loop entry */
        beq     1f
        orr     r3, r8, r7
        tst     r3, #3                  /* src and dst both word-aligned? */
        beq     2f
        mov     r1, r8
        mov     r0, r7
        mov     r12, r11
        b       3f
5:      tst     r3, #3
        beq     4f
3:      ldrb    r2, [r0], #1            /* leading unaligned bytes */
        subs    r12, r12, #1
        ldrb    r3, [r1]
        uqadd8  r3, r2, r3
        strb    r3, [r1], #1
        orr     r3, r1, r0
        bne     5b
1:      ldr     r3, [sp]                /* advance to the next row */
        add     r8, r8, r3
        ldr     r3, [sp, #40]
        add     r7, r7, r3
10:     subs    r10, r10, #1
        bcs     6b
0:      add     sp, sp, #4
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
2:      mov     r12, r11
        mov     r1, r8
        mov     r0, r7
4:      cmp     r12, #3
        subgt   r6, r12, #4
        movgt   r9, r12
        lsrgt   r5, r6, #2
        addgt   r3, r5, #1
        movgt   r12, #0
        lslgt   r4, r3, #2
        ble     7f
8:      ldr     r3, [r0, r12]           /* aligned loop, 4 pixels per UQADD8 */
        ldr     r2, [r1, r12]
        uqadd8  r3, r3, r2
        str     r3, [r1, r12]
        add     r12, r12, #4
        cmp     r12, r4
        bne     8b
        sub     r3, r9, #4
        bic     r3, r3, #3
        add     r3, r3, #4
        subs    r12, r6, r5, lsl #2
        add     r1, r1, r3
        add     r0, r0, r3
        beq     1b
7:      mov     r4, #0
9:      ldrb    r3, [r1, r4]            /* trailing bytes */
        ldrb    r2, [r0, r4]
        uqadd8  r3, r2, r3
        strb    r3, [r1, r4]
        add     r4, r4, #1
        cmp     r4, r12
        bne     9b
        ldr     r3, [sp]
        add     r8, r8, r3
        ldr     r3, [sp, #40]
        add     r7, r7, r3
        b       10b
.endfunc
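/*
 * The next routine implements the premultiplied OVER operator:
 * dst = src + dst * (255 - src_alpha) / 255, per channel. A hedged C
 * model of the per-pixel step (names are illustrative): the division by
 * 255 uses the usual "x += 0x80; x += x >> 8; x >>= 8" rounding trick,
 * which the 0x00800080 constant and the UXTAB16 ..., ror #8 sequences
 * implement two channels at a time.
 *
 *  static uint32_t
 *  mul_div_255 (uint32_t x, uint32_t a)  // (x * a) / 255, rounded
 *  {
 *      x = x * a + 0x80;
 *      return (x + (x >> 8)) >> 8;
 *  }
 *
 *  static uint32_t
 *  over (uint32_t src, uint32_t dst)  // premultiplied a8r8g8b8 OVER
 *  {
 *      uint32_t ia = 255 - (src >> 24);
 *      uint32_t result = 0;
 *      int shift;
 *
 *      for (shift = 0; shift < 32; shift += 8)
 *      {
 *          uint32_t d = mul_div_255 ((dst >> shift) & 0xff, ia)
 *                     + ((src >> shift) & 0xff);
 *          result |= (d > 255 ? 255 : d) << shift;  // UQADD8 saturation
 *      }
 *      return result;
 *  }
 */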
pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
/* Arguments, judging by the register usage below:
 *   r0 = width, r1 = height, r2 = dst, r3 = dst_stride (in pixels),
 *   [sp, #52] = src, [sp, #56] = src_stride (in pixels)
 */
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        sub     sp, sp, #20
        cmp     r1, #0
        mov     r12, r2
        str     r1, [sp, #12]
        str     r0, [sp, #16]
        ldr     r2, [sp, #52]
        beq     0f
        lsl     r3, r3, #2              /* strides: pixels -> bytes */
        str     r3, [sp]
        ldr     r3, [sp, #56]
        mov     r10, #0
        lsl     r3, r3, #2
        str     r3, [sp, #8]
        mov     r11, r3
        b       1f
6:      ldr     r11, [sp, #8]
1:      ldr     r9, [sp]
        mov     r0, r12
        add     r12, r12, r9
        mov     r1, r2
        str     r12, [sp, #4]
        add     r2, r2, r11
        ldr     r12, [sp, #16]
        ldr     r3, =0x00800080         /* rounding bias for the /255 trick */
        ldr     r9, =0xff00ff00
        mov     r11, #255
        cmp     r12, #0
        beq     4f
5:      ldr     r5, [r1], #4            /* r5 = src, r4 = dst */
        ldr     r4, [r0]
        sub     r8, r11, r5, lsr #24    /* r8 = 255 - src alpha */
        uxtb16  r6, r4                  /* even dst channels (b, r) */
        uxtb16  r7, r4, ror #8          /* odd dst channels (g, a) */
        mla     r6, r6, r8, r3
        mla     r7, r7, r8, r3
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        and     r7, r7, r9
        uxtab16 r6, r7, r6, ror #8
        uqadd8  r5, r6, r5              /* dst * (255 - sa) / 255 + src */
        str     r5, [r0], #4
        subs    r12, r12, #1
        bne     5b
4:      ldr     r3, [sp, #12]
        add     r10, r10, #1
        cmp     r10, r3
        ldr     r12, [sp, #4]
        bne     6b
0:      add     sp, sp, #20
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
.endfunc
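/*
 * The next routine is the same OVER operation, except that each source
 * pixel is first multiplied by a constant 8-bit mask value, the alpha
 * byte of the mask argument (the ldrb from [sp, #71] assumes a
 * little-endian layout). In terms of the helpers sketched above
 * (illustrative names):
 *
 *  s = in (src, mask >> 24);
 *  dst = over (s, dst);
 *
 * where 'in' scales all four channels of src by the mask value:
 *
 *  static uint32_t
 *  in (uint32_t src, uint32_t m)
 *  {
 *      uint32_t result = 0;
 *      int shift;
 *
 *      for (shift = 0; shift < 32; shift += 8)
 *          result |= mul_div_255 ((src >> shift) & 0xff, m) << shift;
 *      return result;
 *  }
 */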
pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
/* Arguments, judging by the register usage below:
 *   r0 = width, r1 = height, r2 = dst, r3 = dst_stride (in pixels),
 *   [sp, #60] = src, [sp, #64] = src_stride (in pixels),
 *   [sp, #68] = mask (only its alpha byte is used)
 */
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        sub     sp, sp, #28
        cmp     r1, #0
        str     r1, [sp, #12]
        ldrb    r1, [sp, #71]           /* alpha byte of the mask */
        mov     r12, r2
        str     r0, [sp, #16]
        ldr     r2, [sp, #60]
        str     r1, [sp, #24]
        beq     0f
        lsl     r3, r3, #2
        str     r3, [sp, #20]
        ldr     r3, [sp, #64]
        mov     r10, #0
        lsl     r3, r3, #2
        str     r3, [sp, #8]
        mov     r11, r3
        b       1f
5:      ldr     r11, [sp, #8]
1:      ldr     r4, [sp, #20]
        mov     r0, r12
        mov     r1, r2
        add     r12, r12, r4
        add     r2, r2, r11
        str     r12, [sp]
        str     r2, [sp, #4]
        ldr     r12, [sp, #16]
        ldr     r2, =0x00800080
        ldr     r3, [sp, #24]
        mov     r11, #255
        cmp     r12, #0
        beq     3f
4:      ldr     r5, [r1], #4            /* r5 = src, r4 = dst */
        ldr     r4, [r0]
        uxtb16  r6, r5                  /* src IN mask: scale all four */
        uxtb16  r7, r5, ror #8          /* channels by the mask alpha */
        mla     r6, r6, r3, r2
        mla     r7, r7, r3, r2
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r5, r6, r7, lsl #8
        uxtb16  r6, r4                  /* then the usual OVER step */
        uxtb16  r7, r4, ror #8
        sub     r8, r11, r5, lsr #24
        mla     r6, r6, r8, r2
        mla     r7, r7, r8, r2
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r6, r6, r7, lsl #8
        uqadd8  r5, r6, r5
        str     r5, [r0], #4
        subs    r12, r12, #1
        bne     4b
3:      ldr     r1, [sp, #12]
        add     r10, r10, #1
        cmp     r10, r1
        ldr     r12, [sp]
        ldr     r2, [sp, #4]
        bne     5b
0:      add     sp, sp, #28
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
.endfunc
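/*
 * The next routine composites a constant source color through an a8
 * mask image, i.e. per destination pixel (again in terms of the sketches
 * above, with illustrative names):
 *
 *  dst[i] = over (in (src, mask[i]), dst[i]);
 *
 * The code precomputes the even (b, r) and odd (g, a) channels of the
 * constant source as packed halfwords, so the 'in' step costs only two
 * MLAs per pixel.
 */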
pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
/* Arguments, judging by the register usage below:
 *   r0 = width, r1 = height, r2 = dst, r3 = dst_stride (in pixels),
 *   [sp, #60] = src color, [sp, #68] = mask, [sp, #72] = mask_stride
 *   (in bytes)
 */
        push    {r4, r5, r6, r7, r8, r9, r10, r11}
        sub     sp, sp, #28
        cmp     r1, #0
        ldr     r9, [sp, #60]
        str     r1, [sp, #12]
        bic     r1, r9, #-16777216      /* split the constant source into */
        str     r1, [sp, #20]           /* even (b, r) ... */
        mov     r12, r2
        lsr     r1, r9, #8
        ldr     r2, [sp, #20]
        bic     r1, r1, #-16777216
        bic     r2, r2, #65280
        bic     r1, r1, #65280
        str     r2, [sp, #20]
        str     r0, [sp, #16]
        str     r1, [sp, #4]            /* ... and odd (g, a) halfwords */
        ldr     r2, [sp, #68]
        beq     0f
        lsl     r3, r3, #2
        str     r3, [sp, #24]
        mov     r0, #0
        b       1f
5:      ldr     r3, [sp, #24]
1:      ldr     r4, [sp, #72]
        mov     r10, r12
        mov     r1, r2
        add     r12, r12, r3
        add     r2, r2, r4
        str     r12, [sp, #8]
        str     r2, [sp]
        ldr     r12, [sp, #16]
        ldr     r11, =0x00800080
        ldr     r2, [sp, #4]
        ldr     r3, [sp, #20]
        cmp     r12, #0
        beq     3f
4:      ldrb    r5, [r1], #1            /* r5 = mask byte, r4 = dst */
        ldr     r4, [r10]
        mla     r6, r3, r5, r11         /* src IN mask */
        mla     r7, r2, r5, r11
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r5, r6, r7, lsl #8
        uxtb16  r6, r4                  /* then OVER dst */
        uxtb16  r7, r4, ror #8
        mvn     r8, r5
        lsr     r8, r8, #24
        mla     r6, r6, r8, r11
        mla     r7, r7, r8, r11
        uxtab16 r6, r6, r6, ror #8
        uxtab16 r7, r7, r7, ror #8
        uxtb16  r6, r6, ror #8
        uxtb16  r7, r7, ror #8
        orr     r6, r6, r7, lsl #8
        uqadd8  r5, r6, r5
        str     r5, [r10], #4
        subs    r12, r12, #1
        bne     4b
3:      ldr     r4, [sp, #12]
        add     r0, r0, #1
        cmp     r0, r4
        ldr     r12, [sp, #8]
        ldr     r2, [sp]
        bne     5b
0:      add     sp, sp, #28
        pop     {r4, r5, r6, r7, r8, r9, r10, r11}
        bx      lr
.endfunc
/*
 * Note: this code uses only armv5te instructions (not even armv6), but
 * is scheduled for the ARM Cortex-A8 pipeline, so it may need to be
 * split into a few variants, each tuned for a particular microarchitecture.
 *
 * TODO: to get good performance on ARM9/ARM11 cores (which don't have
 * efficient write combining), it needs to be changed to use 16-byte
 * aligned writes with the STM instruction.
 *
 * The nearest scanline scaler macro template takes the following arguments:
 *  fname                     - name of the function to generate
 *  bpp_shift                 - (1 << bpp_shift) is the size of a pixel
 *                              in bytes
 *  t                         - type suffix for the LDR/STR instructions
 *  prefetch_distance         - prefetch the source image that many
 *                              pixels ahead
 *  prefetch_braking_distance - stop prefetching when that many pixels
 *                              remain before the end of the scanline
*/
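/*
 * A rough C model of one generated scanline function (illustrative names;
 * 'pixel_t' stands for uint16_t or uint32_t depending on bpp_shift, and
 * vx/unit_x are 16.16 fixed-point values, the latter passed on the stack):
 *
 *  static void
 *  scanline_nearest (int32_t w, pixel_t *dst, const pixel_t *src,
 *                    int32_t vx, int32_t unit_x)
 *  {
 *      while (w--)
 *      {
 *          *dst++ = src[vx >> 16];  // nearest source pixel
 *          vx += unit_x;
 *      }
 *  }
 *
 * The 'and TMP, VXMASK, VX, lsr #(16 - bpp_shift)' below computes the
 * byte offset ((vx >> 16) << bpp_shift) in a single instruction.
 */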
.macro generate_nearest_scanline_func fname, bpp_shift, t, \
                                      prefetch_distance, \
                                      prefetch_braking_distance
pixman_asm_function fname
        W       .req    r0
        DST     .req    r1
        SRC     .req    r2
        VX      .req    r3
        UNIT_X  .req    ip
        TMP1    .req    r4
        TMP2    .req    r5
        VXMASK  .req    r6
        PF_OFFS .req    r7

        ldr     UNIT_X, [sp]
        push    {r4, r5, r6, r7}
        mvn     VXMASK, #((1 << bpp_shift) - 1)

        /* define helper macro */
        .macro scale_2_pixels
                ldr&t   TMP1, [SRC, TMP1]
                and     TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
                add     VX, VX, UNIT_X
                str&t   TMP1, [DST], #(1 << bpp_shift)

                ldr&t   TMP2, [SRC, TMP2]
                and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
                add     VX, VX, UNIT_X
                str&t   TMP2, [DST], #(1 << bpp_shift)
        .endm

        /* now do the scaling */
        and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
        add     VX, VX, UNIT_X
        subs    W, W, #(8 + prefetch_braking_distance)
        blt     2f
        /* calculate prefetch offset */
        mov     PF_OFFS, #prefetch_distance
        mla     PF_OFFS, UNIT_X, PF_OFFS, VX
1:      /* main loop, process 8 pixels per iteration with prefetch */
        subs    W, W, #8
        add     PF_OFFS, UNIT_X, lsl #3         /* PF_OFFS += 8 * UNIT_X */
        scale_2_pixels
        scale_2_pixels
        scale_2_pixels
        scale_2_pixels
        pld     [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
        bge     1b
2:
        subs    W, W, #(4 - 8 - prefetch_braking_distance)
        blt     2f
1:      /* process the remaining pixels */
        scale_2_pixels
        scale_2_pixels
        subs    W, W, #4
        bge     1b
2:
        tst     W, #2
        beq     2f
        scale_2_pixels
2:
        tst     W, #1
        ldrne&t TMP1, [SRC, TMP1]
        strne&t TMP1, [DST]
        /* cleanup helper macro */
        .purgem scale_2_pixels
        .unreq  DST
        .unreq  SRC
        .unreq  W
        .unreq  VX
        .unreq  UNIT_X
        .unreq  TMP1
        .unreq  TMP2
        .unreq  VXMASK
        .unreq  PF_OFFS
        /* return */
        pop     {r4, r5, r6, r7}
        bx      lr
.endfunc
.endm
generate_nearest_scanline_func \
        pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32

generate_nearest_scanline_func \
        pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
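/*
 * Judging by the register aliases in the macro, the two instantiations
 * should correspond to C prototypes along these lines (a sketch, not
 * verified against the pixman headers; vx and unit_x are 16.16
 * fixed-point values):
 *
 *  void pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
 *           (int32_t w, uint16_t *dst, const uint16_t *src,
 *            int32_t vx, int32_t unit_x);
 *  void pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6
 *           (int32_t w, uint32_t *dst, const uint32_t *src,
 *            int32_t vx, int32_t unit_x);
 *
 * The 0565 variant prefetches 80 pixels ahead and the 8888 variant 48,
 * both stopping 32 pixels before the end of the scanline.
 */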