/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Jeff Muizelaar (jeff@infidigm.net)
 *
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include "pixman-private.h"

static void
arm_composite_add_8000_8000 (pixman_implementation_t * impl,
                             pixman_op_t op,
                             pixman_image_t * src_image,
                             pixman_image_t * mask_image,
                             pixman_image_t * dst_image,
                             int32_t src_x,
                             int32_t src_y,
                             int32_t mask_x,
                             int32_t mask_y,
                             int32_t dest_x,
                             int32_t dest_y,
                             int32_t width,
                             int32_t height)
{
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    uint16_t w;
    uint8_t s, d;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* Ensure both src and dst are word aligned before doing 32-bit reads;
         * if src and dst have differing alignments we stay in this byte loop
         * for the whole row.
         */
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
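            /* Note: the "+r" (d) read-write constraint makes d available to
             * the asm as an input operand as well, referenced below as %2,
             * so the instruction computes d = saturate (s + d).
             */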
asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
*dst = d;
dst++;
src++;
w--;
}
while (w >= 4)
{
asm ("uqadd8 %0, %1, %2"
: "=r" (*(uint32_t*)dst)
: "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
dst += 4;
src += 4;
w -= 4;
}
while (w)
{
s = *src;
d = *dst;
asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
*dst = d;
dst++;
src++;
w--;
}
}
}
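
/*
 * Reference only: a plain C sketch of the per-byte saturating add that
 * UQADD8 performs on four packed 8-bit values above.  The helper name is
 * made up for illustration; nothing in this file calls it.
 */
static inline uint32_t
sketch_uqadd8_equivalent (uint32_t s, uint32_t d)
{
    uint32_t result = 0;
    int i;

    for (i = 0; i < 4; i++)
    {
        uint32_t sum = ((s >> (i * 8)) & 0xff) + ((d >> (i * 8)) & 0xff);

        if (sum > 0xff)
            sum = 0xff;    /* each byte saturates at 255 */

        result |= sum << (i * 8);
    }

    return result;
}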

static void
arm_composite_over_8888_8888 (pixman_implementation_t * impl,
                              pixman_op_t op,
                              pixman_image_t * src_image,
                              pixman_image_t * mask_image,
                              pixman_image_t * dst_image,
                              int32_t src_x,
                              int32_t src_y,
                              int32_t mask_x,
                              int32_t mask_y,
                              int32_t dest_x,
                              int32_t dest_y,
                              int32_t width,
                              int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    uint16_t w;
    uint32_t component_half = 0x800080;
    uint32_t upper_component_mask = 0xff00ff00;
    uint32_t alpha_mask = 0xff;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* #define inner_branch */
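        /* The assembly below computes premultiplied OVER one pixel at a time:
         * dest = src + dest * (255 - src_alpha) / 255 on each of the four
         * channels, with UQADD8 doing the final saturating add.
         */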
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary write to dest,
             * which is the more valuable saving, so that is the only case we
             * check for.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"
            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
            "ldr r4, [%[dest]] \n\t"
#else
            "ldr r4, [%[dest]] \n\t"
            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
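            /* unpack dest: r6 holds bytes 0 and 2 of r4 as two 16-bit lanes,
             * r7 holds bytes 1 and 3 (including the alpha byte)
             */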
"uxtb16 r6, r4\n\t"
"uxtb16 r7, r4, ror #8\n\t"
/* multiply by 257 and divide by 65536 */
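            /* i.e. t = chan * (255 - alpha) + 0x80, then t = (t + (t >> 8)) >> 8,
             * the usual rounding divide by 255; the final shift happens when the
             * high byte of each halfword lane is picked out during recombination
             */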
"mla r6, r6, r8, %[component_half]\n\t"
"mla r7, r7, r8, %[component_half]\n\t"
"uxtab16 r6, r6, r6, ror #8\n\t"
"uxtab16 r7, r7, r7, ror #8\n\t"
/* recombine the 0xff00ff00 bytes of r6 and r7 */
"and r7, r7, %[upper_component_mask]\n\t"
"uxtab16 r6, r7, r6, ror #8\n\t"
"uqadd8 r5, r6, r5\n\t"
#ifdef inner_branch
"3:\n\t"
#endif
"str r5, [%[dest]], #4\n\t"
/* increment counter and jmp to top */
"subs %[w], %[w], #1\n\t"
"bne 1b\n\t"
"2:\n\t"
: [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
: [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
[alpha_mask] "r" (alpha_mask)
: "r4", "r5", "r6", "r7", "r8", "cc", "memory"
);
}
}
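
/*
 * Reference only: a plain C sketch of what the loop above computes for one
 * pixel.  The helper names are made up for illustration; nothing in this
 * file calls them.
 */
static inline uint32_t
sketch_mul_div_255 (uint32_t chan, uint32_t factor)
{
    uint32_t t = chan * factor + 0x80;

    return (t + (t >> 8)) >> 8;    /* rounding divide by 255 */
}

static inline uint32_t
sketch_over_8888_8888 (uint32_t src, uint32_t dest)
{
    uint32_t inv_alpha = 255 - (src >> 24);
    uint32_t result = 0;
    int i;

    for (i = 0; i < 4; i++)
    {
        uint32_t s = (src >> (i * 8)) & 0xff;
        uint32_t d = (dest >> (i * 8)) & 0xff;
        uint32_t c = s + sketch_mul_div_255 (d, inv_alpha);

        if (c > 0xff)
            c = 0xff;    /* saturate, as UQADD8 does */

        result |= c << (i * 8);
    }

    return result;
}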

static void
arm_composite_over_8888_n_8888 (pixman_implementation_t * impl,
                                pixman_op_t op,
                                pixman_image_t * src_image,
                                pixman_image_t * mask_image,
                                pixman_image_t * dst_image,
                                int32_t src_x,
                                int32_t src_y,
                                int32_t mask_x,
                                int32_t mask_y,
                                int32_t dest_x,
                                int32_t dest_y,
                                int32_t width,
                                int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    uint16_t w;
    uint32_t component_half = 0x800080;
    uint32_t alpha_mask = 0xff;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
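
    /* only the alpha byte of the solid mask is used */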
    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
    mask >>= 24;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* #define inner_branch */
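        /* The assembly below applies the solid mask's alpha to src and then
         * computes OVER: src' = src * mask_alpha / 255 per channel, followed
         * by dest = src' + dest * (255 - src'_alpha) / 255.
         */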
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary write to dest,
             * which is the more valuable saving, so that is the only case we
             * check for.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"
#endif
            "ldr r4, [%[dest]] \n\t"
            "uxtb16 r6, r5\n\t"
            "uxtb16 r7, r5, ror #8\n\t"
            /* multiply by the mask alpha, then by 257 and divide by 65536 */
            "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
            "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"
            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"
            /* recombine the masked src into r5 */
            "orr r5, r6, r7, lsl #8\n\t"
            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"
            /* 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
            /* multiply by (255 - alpha) in r8, then by 257 and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"
            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"
            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"
            /* recombine the attenuated dest */
            "orr r6, r6, r7, lsl #8\n\t"
            "uqadd8 r5, r6, r5\n\t"
#ifdef inner_branch
            "3:\n\t"
#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement the pixel counter and loop back to the top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
            );
    }
}

static void
arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
                             pixman_op_t op,
                             pixman_image_t * src_image,
                             pixman_image_t * mask_image,
                             pixman_image_t * dst_image,
                             int32_t src_x,
                             int32_t src_y,
                             int32_t mask_x,
                             int32_t mask_y,
                             int32_t dest_x,
                             int32_t dest_y,
                             int32_t width,
                             int32_t height)
{
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    uint16_t w;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* bail out if fully transparent */
    srca = src >> 24;
    if (src == 0)
        return;

    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;
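
    /* split the solid source once into its even and odd byte pairs so the
     * per-pixel work below is just two multiplies by the mask byte
     */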
    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        /* #define inner_branch */
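        /* The assembly below scales the solid source by each 8-bit mask value
         * and composites it OVER dest: src' = src * m / 255 per channel, then
         * dest = src' + dest * (255 - src'_alpha) / 255.
         */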
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load mask */
            "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary write to dest,
             * which is the more valuable saving, so that is the only case we
             * check for.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"
#endif
            "ldr r4, [%[dest]] \n\t"
            /* multiply the pre-split source by the mask byte (r5), then by 257
             * and divide by 65536 */
"mla r6, %[src_lo], r5, %[component_half]\n\t"
"mla r7, %[src_hi], r5, %[component_half]\n\t"
"uxtab16 r6, r6, r6, ror #8\n\t"
"uxtab16 r7, r7, r7, ror #8\n\t"
"uxtb16 r6, r6, ror #8\n\t"
"uxtb16 r7, r7, ror #8\n\t"
/* recombine */
"orr r5, r6, r7, lsl #8\n\t"
"uxtb16 r6, r4\n\t"
"uxtb16 r7, r4, ror #8\n\t"
/* we could simplify this to use 'sub' if we were
* willing to give up a register for alpha_mask */
"mvn r8, r5\n\t"
"mov r8, r8, lsr #24\n\t"
            /* multiply dest by (255 - alpha) in r8, then by 257 and divide
             * by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"
            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"
            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"
            /* recombine the attenuated dest */
            "orr r6, r6, r7, lsl #8\n\t"
            "uqadd8 r5, r6, r5\n\t"
#ifdef inner_branch
            "3:\n\t"
#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement the pixel counter and loop back to the top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
            : [component_half] "r" (component_half),
              [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
    }
}
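
/*
 * Reference only: a plain C sketch of the per-pixel "solid source IN a8 mask
 * OVER dest" operation performed by the loop above.  The helper name is made
 * up for illustration; nothing in this file calls it.
 */
static inline uint32_t
sketch_in_over_n_8_8888 (uint32_t src, uint8_t mask_byte, uint32_t dest)
{
    uint32_t masked_src = 0;
    uint32_t result = 0;
    uint32_t inv_alpha;
    int i;

    /* apply the mask: masked_src = src * mask_byte / 255 on each channel */
    for (i = 0; i < 4; i++)
    {
        uint32_t t = ((src >> (i * 8)) & 0xff) * mask_byte + 0x80;

        masked_src |= (((t + (t >> 8)) >> 8) & 0xff) << (i * 8);
    }

    /* then the usual OVER against dest */
    inv_alpha = 255 - (masked_src >> 24);

    for (i = 0; i < 4; i++)
    {
        uint32_t t = ((dest >> (i * 8)) & 0xff) * inv_alpha + 0x80;
        uint32_t c = ((masked_src >> (i * 8)) & 0xff) + ((t + (t >> 8)) >> 8);

        if (c > 0xff)
            c = 0xff;    /* saturate, as UQADD8 does */

        result |= c << (i * 8);
    }

    return result;
}
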
static const pixman_fast_path_t arm_simd_fast_path_array[] =
{
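    /* op, src format, mask format, dest format, function, flags */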
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, arm_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, arm_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, arm_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, arm_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, arm_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, arm_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, arm_composite_add_8000_8000, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, arm_composite_over_n_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, arm_composite_over_n_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, arm_composite_over_n_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, arm_composite_over_n_8_8888, 0 },
    { PIXMAN_OP_NONE },
};
const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array;
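
/*
 * Dispatch: try the ARM SIMD fast paths above first; if none matches, fall
 * back to the delegate implementation.
 */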
static void
arm_simd_composite (pixman_implementation_t *imp,
                    pixman_op_t op,
                    pixman_image_t * src,
                    pixman_image_t * mask,
                    pixman_image_t * dest,
                    int32_t src_x,
                    int32_t src_y,
                    int32_t mask_x,
                    int32_t mask_y,
                    int32_t dest_x,
                    int32_t dest_y,
                    int32_t width,
                    int32_t height)
{
    if (_pixman_run_fast_path (arm_simd_fast_paths, imp,
                               op, src, mask, dest,
                               src_x, src_y,
                               mask_x, mask_y,
                               dest_x, dest_y,
                               width, height))
    {
        return;
    }

    _pixman_implementation_composite (imp->delegate, op,
                                      src, mask, dest,
                                      src_x, src_y,
                                      mask_x, mask_y,
                                      dest_x, dest_y,
                                      width, height);
}

pixman_implementation_t *
_pixman_implementation_create_arm_simd (void)
{
    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
    pixman_implementation_t *imp = _pixman_implementation_create (general);

    imp->composite = arm_simd_composite;

    return imp;
}