/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission. Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose. It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Jeff Muizelaar (jeff@infidigm.net)
 *
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include "pixman-private.h"

static void
arm_composite_add_8000_8000 (pixman_implementation_t * impl,
                             pixman_op_t op,
                             pixman_image_t * src_image,
                             pixman_image_t * mask_image,
                             pixman_image_t * dst_image,
                             int32_t src_x,
                             int32_t src_y,
                             int32_t mask_x,
                             int32_t mask_y,
                             int32_t dest_x,
                             int32_t dest_y,
                             int32_t width,
                             int32_t height)
{
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    uint16_t w;
    uint8_t s, d;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* Ensure both src and dst are word aligned before doing 32-bit reads;
         * if src and dst have differing alignments we stay in this byte loop
         * for the whole row.
         */
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
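            /* Note: the "+r" (d) read-write constraint makes d available to
             * the asm as an input operand as well, referenced below as %2,
             * so the instruction computes d = saturate (s + d).
             */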
asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
*dst = d;
dst++;
src++;
w--;
}
while (w >= 4)
{
asm ("uqadd8 %0, %1, %2"
: "=r" (*(uint32_t*)dst)
: "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
dst += 4;
src += 4;
w -= 4;
}
while (w)
{
s = *src;
d = *dst;
asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
*dst = d;
dst++;
src++;
w--;
}
}
}
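
/*
 * Reference only: a plain C sketch of the per-byte saturating add that
 * UQADD8 performs on four packed 8-bit values above.  The helper name is
 * made up for illustration; nothing in this file calls it.
 */
static inline uint32_t
sketch_uqadd8_equivalent (uint32_t s, uint32_t d)
{
    uint32_t result = 0;
    int i;

    for (i = 0; i < 4; i++)
    {
        uint32_t sum = ((s >> (i * 8)) & 0xff) + ((d >> (i * 8)) & 0xff);

        if (sum > 0xff)
            sum = 0xff;    /* each byte saturates at 255 */

        result |= sum << (i * 8);
    }

    return result;
}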

static void
arm_composite_over_8888_8888 (pixman_implementation_t * impl,
                              pixman_op_t op,
                              pixman_image_t * src_image,
                              pixman_image_t * mask_image,
                              pixman_image_t * dst_image,
                              int32_t src_x,
                              int32_t src_y,
                              int32_t mask_x,
                              int32_t mask_y,
                              int32_t dest_x,
                              int32_t dest_y,
                              int32_t width,
                              int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    uint16_t w;
    uint32_t component_half = 0x800080;
    uint32_t upper_component_mask = 0xff00ff00;
    uint32_t alpha_mask = 0xff;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* #define inner_branch */
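        /* The assembly below computes premultiplied OVER one pixel at a time:
         * dest = src + dest * (255 - src_alpha) / 255 on each of the four
         * channels, with UQADD8 doing the final saturating add.
         */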
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary write to dest,
             * which is the more valuable saving, so that is the only case we
             * check for.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"
            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
            "ldr r4, [%[dest]] \n\t"
#else
            "ldr r4, [%[dest]] \n\t"
            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
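            /* unpack dest: r6 holds bytes 0 and 2 of r4 as two 16-bit lanes,
             * r7 holds bytes 1 and 3 (including the alpha byte)
             */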
"uxtb16 r6, r4\n\t"
"uxtb16 r7, r4, ror #8\n\t"
/* multiply by 257 and divide by 65536 */
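            /* i.e. t = chan * (255 - alpha) + 0x80, then t = (t + (t >> 8)) >> 8,
             * the usual rounding divide by 255; the final shift happens when the
             * high byte of each halfword lane is picked out during recombination
             */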
"mla r6, r6, r8, %[component_half]\n\t"
"mla r7, r7, r8, %[component_half]\n\t"
"uxtab16 r6, r6, r6, ror #8\n\t"
"uxtab16 r7, r7, r7, ror #8\n\t"
/* recombine the 0xff00ff00 bytes of r6 and r7 */
"and r7, r7, %[upper_component_mask]\n\t"
"uxtab16 r6, r7, r6, ror #8\n\t"
"uqadd8 r5, r6, r5\n\t"
#ifdef inner_branch
"3:\n\t"
#endif
"str r5, [%[dest]], #4\n\t"
/* increment counter and jmp to top */
"subs %[w], %[w], #1\n\t"
"bne 1b\n\t"
"2:\n\t"
: [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
: [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
[alpha_mask] "r" (alpha_mask)
: "r4", "r5", "r6", "r7", "r8", "cc", "memory"
);
}
}
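
/*
 * Reference only: a plain C sketch of what the loop above computes for one
 * pixel.  The helper names are made up for illustration; nothing in this
 * file calls them.
 */
static inline uint32_t
sketch_mul_div_255 (uint32_t chan, uint32_t factor)
{
    uint32_t t = chan * factor + 0x80;

    return (t + (t >> 8)) >> 8;    /* rounding divide by 255 */
}

static inline uint32_t
sketch_over_8888_8888 (uint32_t src, uint32_t dest)
{
    uint32_t inv_alpha = 255 - (src >> 24);
    uint32_t result = 0;
    int i;

    for (i = 0; i < 4; i++)
    {
        uint32_t s = (src >> (i * 8)) & 0xff;
        uint32_t d = (dest >> (i * 8)) & 0xff;
        uint32_t c = s + sketch_mul_div_255 (d, inv_alpha);

        if (c > 0xff)
            c = 0xff;    /* saturate, as UQADD8 does */

        result |= c << (i * 8);
    }

    return result;
}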

static void
arm_composite_over_8888_n_8888 (pixman_implementation_t * impl,
                                pixman_op_t op,
                                pixman_image_t * src_image,
                                pixman_image_t * mask_image,
                                pixman_image_t * dst_image,
                                int32_t src_x,
                                int32_t src_y,
                                int32_t mask_x,
                                int32_t mask_y,
                                int32_t dest_x,
                                int32_t dest_y,
                                int32_t width,
                                int32_t height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    uint16_t w;
    uint32_t component_half = 0x800080;
    uint32_t alpha_mask = 0xff;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
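
    /* only the alpha byte of the solid mask is used */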
    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
    mask >>= 24;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* #define inner_branch */
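        /* The assembly below applies the solid mask's alpha to src and then
         * computes OVER: src' = src * mask_alpha / 255 per channel, followed
         * by dest = src' + dest * (255 - src'_alpha) / 255.
         */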
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary write to dest,
             * which is the more valuable saving, so that is the only case we
             * check for.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"
#endif
            "ldr r4, [%[dest]] \n\t"
            "uxtb16 r6, r5\n\t"
            "uxtb16 r7, r5, ror #8\n\t"
            /* multiply by the mask alpha, then by 257 and divide by 65536 */
            "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
            "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"
            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"
            /* recombine the masked src into r5 */
            "orr r5, r6, r7, lsl #8\n\t"
            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"
            /* 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
            /* multiply by (255 - alpha) in r8, then by 257 and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"
            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"
            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"
            /* recombine the attenuated dest */
            "orr r6, r6, r7, lsl #8\n\t"
            "uqadd8 r5, r6, r5\n\t"
#ifdef inner_branch
            "3:\n\t"
#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement the pixel counter and loop back to the top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
            );
    }
}

static void
arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
                             pixman_op_t op,
                             pixman_image_t * src_image,
                             pixman_image_t * mask_image,
                             pixman_image_t * dst_image,
                             int32_t src_x,
                             int32_t src_y,
                             int32_t mask_x,
                             int32_t mask_y,
                             int32_t dest_x,
                             int32_t dest_y,
                             int32_t width,
                             int32_t height)
{
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    uint16_t w;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* bail out if fully transparent */
    srca = src >> 24;
    if (src == 0)
        return;

    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;
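
    /* split the solid source once into its even and odd byte pairs so the
     * per-pixel work below is just two multiplies by the mask byte
     */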
    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        /* #define inner_branch */
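        /* The assembly below scales the solid source by each 8-bit mask value
         * and composites it OVER dest: src' = src * m / 255 per channel, then
         * dest = src' + dest * (255 - src'_alpha) / 255.
         */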
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load mask */
            "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also lets us skip an unnecessary write to dest,
             * which is the more valuable saving, so that is the only case we
             * check for.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"
#endif
            "ldr r4, [%[dest]] \n\t"
            /* multiply the pre-split source by the mask byte (r5), then by 257
             * and divide by 65536 */
"mla r6, %[src_lo], r5, %[component_half]\n\t"
"mla r7, %[src_hi], r5, %[component_half]\n\t"
"uxtab16 r6, r6, r6, ror #8\n\t"
"uxtab16 r7, r7, r7, ror #8\n\t"
"uxtb16 r6, r6, ror #8\n\t"
"uxtb16 r7, r7, ror #8\n\t"
/* recombine */
"orr r5, r6, r7, lsl #8\n\t"
"uxtb16 r6, r4\n\t"
"uxtb16 r7, r4, ror #8\n\t"
/* we could simplify this to use 'sub' if we were
* willing to give up a register for alpha_mask */
"mvn r8, r5\n\t"
"mov r8, r8, lsr #24\n\t"
            /* multiply dest by (255 - alpha) in r8, then by 257 and divide
             * by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"
            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"
            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"
            /* recombine the attenuated dest */
            "orr r6, r6, r7, lsl #8\n\t"
            "uqadd8 r5, r6, r5\n\t"
#ifdef inner_branch
            "3:\n\t"
#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement the pixel counter and loop back to the top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
            : [component_half] "r" (component_half),
              [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
    }
}
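
/*
 * Reference only: a plain C sketch of the per-pixel "solid source IN a8 mask
 * OVER dest" operation performed by the loop above.  The helper name is made
 * up for illustration; nothing in this file calls it.
 */
static inline uint32_t
sketch_in_over_n_8_8888 (uint32_t src, uint8_t mask_byte, uint32_t dest)
{
    uint32_t masked_src = 0;
    uint32_t result = 0;
    uint32_t inv_alpha;
    int i;

    /* apply the mask: masked_src = src * mask_byte / 255 on each channel */
    for (i = 0; i < 4; i++)
    {
        uint32_t t = ((src >> (i * 8)) & 0xff) * mask_byte + 0x80;

        masked_src |= (((t + (t >> 8)) >> 8) & 0xff) << (i * 8);
    }

    /* then the usual OVER against dest */
    inv_alpha = 255 - (masked_src >> 24);

    for (i = 0; i < 4; i++)
    {
        uint32_t t = ((dest >> (i * 8)) & 0xff) * inv_alpha + 0x80;
        uint32_t c = ((masked_src >> (i * 8)) & 0xff) + ((t + (t >> 8)) >> 8);

        if (c > 0xff)
            c = 0xff;    /* saturate, as UQADD8 does */

        result |= c << (i * 8);
    }

    return result;
}
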
static const pixman_fast_path_t arm_simd_fast_path_array[] =
{
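    /* op, src format, mask format, dest format, function, flags */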
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, arm_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, arm_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, arm_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, arm_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, arm_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, arm_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, arm_composite_add_8000_8000, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, arm_composite_over_n_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, arm_composite_over_n_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, arm_composite_over_n_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, arm_composite_over_n_8_8888, 0 },
    { PIXMAN_OP_NONE },
};
const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array;
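
/*
 * Dispatch: try the ARM SIMD fast paths above first; if none matches, fall
 * back to the delegate implementation.
 */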
static void
arm_simd_composite (pixman_implementation_t *imp,
                    pixman_op_t op,
                    pixman_image_t * src,
                    pixman_image_t * mask,
                    pixman_image_t * dest,
                    int32_t src_x,
                    int32_t src_y,
                    int32_t mask_x,
                    int32_t mask_y,
                    int32_t dest_x,
                    int32_t dest_y,
                    int32_t width,
                    int32_t height)
{
    if (_pixman_run_fast_path (arm_simd_fast_paths, imp,
                               op, src, mask, dest,
                               src_x, src_y,
                               mask_x, mask_y,
                               dest_x, dest_y,
                               width, height))
    {
        return;
    }

    _pixman_implementation_composite (imp->delegate, op,
                                      src, mask, dest,
                                      src_x, src_y,
                                      mask_x, mask_y,
                                      dest_x, dest_y,
                                      width, height);
}

pixman_implementation_t *
_pixman_implementation_create_arm_simd (void)
{
    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
    pixman_implementation_t *imp = _pixman_implementation_create (general);

    imp->composite = arm_simd_composite;

    return imp;
}