blob: 5907de057e1dbe70dbe7cf836db69dab9fa79aed [file] [log] [blame]
/*
* Copyright © 2008 Rodrigo Kumpera
* Copyright © 2008 André Tupinambá
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of Red Hat not be used in advertising or
* publicity pertaining to distribution of the software without specific,
* written prior permission. Red Hat makes no representations about the
* suitability of this software for any purpose. It is provided "as is"
* without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*
* Author: Rodrigo Kumpera (kumpera@gmail.com)
* André Tupinambá (andrelrt@gmail.com)
*
* Based on work by Owen Taylor and Søren Sandmann
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdint.h>    /* for uintptr_t */

#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */

#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-fast-path.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
#    include "pixman-x64-mmx-emulation.h"
#endif
#ifdef USE_SSE2

/* --------------------------------------------------------------------
 * Locals
 *
 * SIMD constant masks shared by every routine in this file.  They are
 * non-const statics, so they must be filled in by an initialization
 * routine before any fast path runs -- NOTE(review): the initializer
 * is outside this chunk; confirm it is always called first.
 * The lane values implied by the names (e.g. 0x0080 / 0x0101 per
 * 16-bit lane) are assumptions from the identifiers, not visible here.
 */

/* MMX (__m64) variants, used by the scalar-pixel helpers */
static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_x_alpha;

/* r5g6b5 expansion masks (MMX) -- see expand565_16_1x64 */
static __m64 mask_x565_rgb;
static __m64 mask_x565_unpack;

/* SSE2 (__m128i) variants, used by the 4-pixel loops */
static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

/* r5g6b5 pack/unpack field masks -- see pack_565_2x128_128 and
 * unpack_565_to_8888 */
static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;
/* ----------------------------------------------------------------------
* SSE2 Inlines
*/
/* Zero-extend the four 8-bit channels of one a8r8g8b8 pixel into the
 * four low 16-bit lanes of an xmm register (high lanes are zero). */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}
/* Zero-extend 16 packed bytes (4 pixels) into two registers of eight
 * 16-bit lanes: *data_lo gets the low 2 pixels, *data_hi the high 2. */
static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}
/* Convert four r5g6b5 values (one per 32-bit lane, low 16 bits) to
 * x8r8g8b8.  Each field is shifted into its 888 position, then its top
 * bits are replicated into the low bits (the mask_565_fix_* step) so
 * that full-scale 565 maps to full-scale 888. */
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);   /* replicate r/b high bits downward */
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);   /* replicate g high bits downward */
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
/* Expand eight packed r5g6b5 pixels into four registers of unpacked
 * (16-bit-lane) 8888 data: data0/1 from the low four pixels, data2/3
 * from the high four. */
static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}
/* Truncate one x8r8g8b8 pixel to r5g6b5 by dropping the low bits of
 * each channel. */
static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    uint32_t r5 = (pixel >> 8) & 0xf800;
    uint32_t g6 = (pixel >> 5) & 0x07e0;
    uint32_t b5 = (pixel >> 3) & 0x001f;

    return (uint16_t) (r5 | g6 | b5);
}
/* Pack two registers of 16-bit lanes back into 16 bytes (4 pixels)
 * with unsigned saturation. */
static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}
/* Pack two unpacked-8888 registers (2 pixels each) into four a8r8g8b8
 * bytes, then assemble an r5g6b5 value from each pixel's top channel
 * bits.  The two green masks (g1/g2) pick up the field from either
 * side of the byte boundary. */
static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}
/* Pack four unpacked-8888 registers (8 pixels) into eight packed
 * r5g6b5 values. */
static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}
/* Non-zero iff all four packed a8r8g8b8 pixels have alpha == 0xff.
 * The 0x8888 mask selects the movemask bits of byte 3 of each pixel
 * (the alpha byte). */
static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);   /* all-ones constant */

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}
/* Non-zero iff all 16 bytes (all four pixels, all channels) are 0. */
static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}
/* Non-zero iff all four pixels have alpha == 0 (only the alpha bytes
 * are tested, via the 0x8888 movemask selection). */
static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
/* Unpack one pixel to 16-bit lanes and duplicate it into both the low
 * and high halves of the register (two copies of the pixel). */
static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}
/* Broadcast each pixel's alpha lane (lane 3 of each 4-lane half) to
 * all four channel lanes of that half. */
static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
						     _MM_SHUFFLE (3, 3, 3, 3)),
				_MM_SHUFFLE (3, 3, 3, 3));
}
/* For two unpacked registers (4 pixels), broadcast each pixel's alpha
 * lane to all of that pixel's channel lanes.  In-place use (alpha_* ==
 * data_*) is safe. */
static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}
/* Like expand_alpha_2x128, but broadcast lane 0 of each pixel instead
 * of lane 3 -- used when the alpha value sits in the low lane (e.g.
 * after unpacking an a8 mask byte). */
static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}
/* Per-lane ret = (data * alpha) / 255 on two unpacked registers.
 * Assuming mask_0080/mask_0101 hold 0x0080/0x0101 per lane (implied by
 * their names; initialized elsewhere), this is the exact rounded
 * divide-by-255: ((t + 0x80) * 0x101) >> 16 where t = data * alpha.
 * Output may alias input. */
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
/* ret = src * alpha_dst / 255 + dst * alpha_src / 255, with unsigned
 * byte saturation on the sum.  This is the shared kernel of the ATOP /
 * XOR style operators. */
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}
/* Per-lane neg = 255 - data (values are 8-bit quantities held in
 * 16-bit lanes, so XOR with 0x00ff computes the complement). */
static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}
/* Swap the R and B lanes of each pixel (shuffle 3,0,1,2 keeps alpha in
 * place and reverses the color order) -- converts between ARGB and
 * ABGR channel order. */
static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}
/* Porter-Duff OVER on unpacked data: dst = src + dst * (255 - alpha)
 * / 255, with byte saturation on the add.  dst_lo/dst_hi are updated
 * in place. */
static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
/* OVER for non-premultiplied, reversed-channel source pixels: swap the
 * source's R/B order, premultiply by its alpha (with the alpha lane
 * forced to full via mask_alpha so alpha itself survives the
 * multiply), then composite over dst. */
static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    /* Force the alpha lane to its maximum so premultiplying leaves
     * the pixel's alpha intact. */
    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}
/* Combined IN + OVER: dst = (src IN mask) OVER dst, i.e. both the
 * source and its alpha are first scaled by the (per-channel) mask.
 * dst_lo/dst_hi are updated in place. */
static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}
/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}
/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address (non-temporal store: bypasses the cache,
 * so the caller is responsible for any needed store fencing)
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}
/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}
/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}
/* ------------------------------------------------------------------
* MMX inlines
*/
/* Move one 32-bit pixel into the low half of an MMX register. */
static force_inline __m64
load_32_1x64 (uint32_t data)
{
    return _mm_cvtsi32_si64 (data);
}
/* Zero-extend the four 8-bit channels of one pixel into the four
 * 16-bit lanes of an MMX register (MMX twin of unpack_32_1x128). */
static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
}
/* Broadcast the alpha lane (lane 3) to all four lanes. */
static force_inline __m64
expand_alpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
}
/* Broadcast lane 0 to all four lanes (alpha stored in the low lane,
 * e.g. an unpacked a8 mask byte). */
static force_inline __m64
expand_alpha_rev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}
/* Replicate one 8-bit value into all four 16-bit lanes. */
static force_inline __m64
expand_pixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (
	unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}
/* Per-lane (data * alpha) / 255 with rounding -- MMX twin of
 * pix_multiply_2x128; relies on mask_x0080/mask_x0101 being 0x0080 /
 * 0x0101 per lane (implied by the names; initialized elsewhere). */
static force_inline __m64
pix_multiply_1x64 (__m64 data,
                   __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          mask_x0080),
                           mask_x0101);
}
/* src * alpha_dst / 255 + dst * alpha_src / 255 with byte-saturating
 * add -- MMX twin of pix_add_multiply_2x128. */
static force_inline __m64
pix_add_multiply_1x64 (__m64* src,
                       __m64* alpha_dst,
                       __m64* dst,
                       __m64* alpha_src)
{
    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);

    return _mm_adds_pu8 (t1, t2);
}
/* Per-lane 255 - data (XOR with 0x00ff per lane). */
static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, mask_x00ff);
}
/* Swap the R and B lanes, keeping alpha in place (ARGB <-> ABGR). */
static force_inline __m64
invert_colors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}
/* Single-pixel OVER: src + dst * (255 - alpha) / 255, saturating. */
static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha)));
}
/* Single-pixel (src IN mask) OVER dst -- MMX twin of in_over_2x128. */
static force_inline __m64
in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pix_multiply_1x64 (*src, *mask),
                      pix_multiply_1x64 (*alpha, *mask),
                      *dst);
}
/* Single-pixel OVER for a non-premultiplied, reversed-channel source:
 * swap R/B, premultiply by alpha (alpha lane forced to full via
 * mask_x_alpha), then composite over dst. */
static force_inline __m64
over_rev_non_pre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expand_alpha_1x64 (src);

    return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src),
                                         _mm_or_si64 (alpha, mask_x_alpha)),
                      alpha,
                      dst);
}
/* Saturating-pack the four 16-bit lanes back into one a8r8g8b8
 * pixel. */
static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
*
* 00RR00GG00BB
*
* --- Expanding 565 in the low word ---
*
* m = (m << (32 - 3)) | (m << (16 - 5)) | m;
* m = m & (01f0003f001f);
* m = m * (008404100840);
* m = m >> 8;
*
* Note the trick here - the top word is shifted by another nibble to
* avoid it bumping into the middle word
*/
/* Expand one r5g6b5 pixel into 00RR00GG00BB lanes of an MMX register,
 * using the shift/mask/multiply trick described in the comment block
 * above this function. */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);    /* red field (extra nibble, see note above) */
    t2 = _mm_slli_si64 (p, 16 - 5);     /* green field */

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, mask_x565_rgb);
    p = _mm_mullo_pi16 (p, mask_x565_unpack);

    return _mm_srli_pi16 (p, 8);
}
/* ----------------------------------------------------------------------------
* Compose Core transformations
*/
/* One-pixel OVER with the two common shortcuts: opaque source wins
 * outright, fully transparent source leaves dst untouched. */
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m64 ms;

    a = src >> 24;

    if (a == 0xff)
    {
	return src;
    }
    else if (src)
    {
	ms = unpack_32_1x64 (src);
	return pack_1x64_32 (
	    over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst)));
    }

    return dst;
}
/* Fetch one source pixel; if a mask pointer is given, scale the pixel
 * by the mask pixel's expanded alpha (unified-mask semantics). */
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
	__m64 ms, mm;

	mm = unpack_32_1x64 (*pm);
	mm = expand_alpha_1x64 (mm);

	ms = unpack_32_1x64 (s);
	ms = pix_multiply_1x64 (ms, mm);

	s = pack_1x64_32 (ms);
    }

    return s;
}
/* Fetch four source pixels (unaligned); if a mask is given, multiply
 * each by the corresponding mask pixel's expanded alpha.  A fully
 * transparent mask short-circuits to zero without touching the source
 * memory at all. */
static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
	xmm_msk_lo = load_128_unaligned (pm);

	if (is_transparent (xmm_msk_lo))
	    return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_msk_lo, &xmm_msk_hi,
			    &xmm_src_lo, &xmm_src_hi);

	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
/* dst = src OVER dst for w pixels; if pm is non-NULL, src is first
 * multiplied by the mask's alpha (combine1/combine4).
 *
 * Structure: scalar head until pd is 16-byte aligned, a 4-pixel SSE2
 * body with opaque/transparent shortcuts, and a scalar tail.
 */
static force_inline void
core_combine_over_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary.  uintptr_t, not unsigned long:
     * unsigned long is 32 bits on LLP64 targets (64-bit Windows), which
     * would truncate the pointer before the alignment test. */
    while (w && ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
	ps++;
	if (pm)
	    pm++;
	w--;
    }

    while (w >= 4)
    {
	/* Loading the source unaligned because only dst was aligned
	 * above. */
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);

	if (is_opaque (xmm_src_hi))
	{
	    /* All four sources opaque: straight copy. */
	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
	}
	else if (!is_zero (xmm_src_hi))
	{
	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	    expand_alpha_2x128 (
		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst_lo, &xmm_dst_hi);

	    /* rebuild the 4 pixel data and save */
	    save_128_aligned ((__m128i*)pd,
			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}

	w -= 4;
	ps += 4;
	pd += 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
	ps++;
	if (pm)
	    pm++;
	w--;
    }
}
/* dst = dst OVER src for w pixels (arguments to the pixel helper are
 * simply swapped relative to core_combine_over_u_sse2); pm, when
 * non-NULL, scales src by the mask alpha.
 */
static force_inline void
core_combine_over_reverse_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary; uintptr_t avoids pointer
     * truncation on LLP64 targets (unsigned long is 32-bit there). */
    while (w &&
           ((uintptr_t)pd & 15))
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	/* Source may be unaligned; only dst alignment is guaranteed. */
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);

	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_src_lo, &xmm_src_hi);

	/* rebuild the 4 pixel data and save */
	save_128_aligned ((__m128i*)pd,
			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));

	w -= 4;
	ps += 4;
	pd += 4;

	if (pm)
	    pm += 4;
    }

    while (w)
    {
	d = *pd;
	s = combine1 (ps, pm);

	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
	ps++;
	w--;
	if (pm)
	    pm++;
    }
}
/* One-pixel IN: dst scaled by src's alpha, with shortcuts for the
 * fully transparent (result 0) and fully opaque (dst unchanged)
 * source cases.  Note the argument roles: the first argument supplies
 * the alpha, the second supplies the color. */
static force_inline uint32_t
core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
	return 0;
    }
    else if (maska != 0xff)
    {
	return pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (dst),
			       expand_alpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}
/* dst = src IN dst (src scaled by dst's alpha) for w pixels; pm, when
 * non-NULL, scales src by the mask alpha first.
 */
static force_inline void
core_combine_in_u_sse2 (uint32_t*       pd,
                        const uint32_t* ps,
                        const uint32_t* pm,
                        int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* uintptr_t avoids pointer truncation on LLP64 targets (unsigned
     * long is only 32-bit on 64-bit Windows). */
    while (w && ((uintptr_t) pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixelsse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned ((__m128i*)pd,
			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixelsse2 (d, s);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
/* dst = dst IN src (dst scaled by src's alpha) for w pixels; pm, when
 * non-NULL, scales src by the mask alpha first.
 */
static force_inline void
core_combine_reverse_in_u_sse2 (uint32_t*       pd,
                                const uint32_t* ps,
                                const uint32_t *pm,
                                int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && ((uintptr_t) pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixelsse2 (s, d);
	ps++;
	w--;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_in_u_pixelsse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
/* dst = dst OUT src, i.e. dst * (255 - src.alpha) / 255, for w
 * pixels; pm, when non-NULL, scales src by the mask alpha first.
 */
static force_inline void
core_combine_reverse_out_u_sse2 (uint32_t*       pd,
                                 const uint32_t* ps,
                                 const uint32_t* pm,
                                 int             w)
{
    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && ((uintptr_t) pd & 15))
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d), negate_1x64 (
		    expand_alpha_1x64 (unpack_32_1x64 (s)))));

	if (pm)
	    pm++;
	ps++;
	w--;
    }

    while (w >= 4)
    {
	__m128i xmm_src_lo, xmm_src_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	if (pm)
	    pm += 4;

	w -= 4;
    }

    while (w)
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d), negate_1x64 (
		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
	ps++;
	if (pm)
	    pm++;
	w--;
    }
}
/* dst = src OUT dst, i.e. src * (255 - dst.alpha) / 255, for w
 * pixels; pm, when non-NULL, scales src by the mask alpha first.
 */
static force_inline void
core_combine_out_u_sse2 (uint32_t*       pd,
                         const uint32_t* ps,
                         const uint32_t* pm,
                         int             w)
{
    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && ((uintptr_t) pd & 15))
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (s), negate_1x64 (
		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	__m128i xmm_src_lo, xmm_src_hi;
	__m128i xmm_dst_lo, xmm_dst_hi;

	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_dst_lo, &xmm_dst_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	uint32_t s = combine1 (ps, pm);
	uint32_t d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (s), negate_1x64 (
		    expand_alpha_1x64 (unpack_32_1x64 (d)))));
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
/* One-pixel ATOP: src * dst.alpha / 255 + dst * (255 - src.alpha) /
 * 255, with byte saturation. */
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expand_alpha_1x64 (s));
    __m64 da = expand_alpha_1x64 (d);

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
/* dst = src ATOP dst for w pixels; pm, when non-NULL, scales src by
 * the mask alpha first.
 */
static force_inline void
core_combine_atop_u_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t* pm,
                          int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && ((uintptr_t) pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* ATOP uses (255 - src.alpha) against dst. */
	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
/* One-pixel ATOP-reverse: src * (255 - dst.alpha) / 255 + dst *
 * src.alpha / 255, with byte saturation. */
static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expand_alpha_1x64 (s);
    __m64 da = negate_1x64 (expand_alpha_1x64 (d));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa));
}
/* dst = dst ATOP src for w pixels; pm, when non-NULL, scales src by
 * the mask alpha first.
 */
static force_inline void
core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
                                  const uint32_t* ps,
                                  const uint32_t* pm,
                                  int             w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && ((uintptr_t) pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
	ps++;
	w--;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* Reverse ATOP uses (255 - dst.alpha) against src. */
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
	ps++;
	w--;
	if (pm)
	    pm++;
    }
}
/* One-pixel XOR: src * (255 - dst.alpha) / 255 + dst * (255 -
 * src.alpha) / 255, with byte saturation. */
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d));
    __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s));

    return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s));
}
/* dst = src XOR dst for width pixels; mask, when non-NULL, scales src
 * by the mask alpha first.
 */
static force_inline void
core_combine_xor_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t *mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && ((uintptr_t) pd & 15))
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
	xmm_dst = load_128_aligned ((__m128i*) pd);

	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* XOR weights each operand by the complement of the other's
	 * alpha. */
	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_add_multiply_2x128 (
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	w -= 4;
	if (pm)
	    pm += 4;
    }

    while (w)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }
}
/* dst = src ADD dst (saturating per-byte add) for width pixels; mask,
 * when non-NULL, scales src by the mask alpha first.
 */
static force_inline void
core_combine_add_u_sse2 (uint32_t*       dst,
                         const uint32_t* src,
                         const uint32_t* mask,
                         int             width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && (uintptr_t)pd & 15)
    {
	s = combine1 (ps, pm);
	d = *pd;

	ps++;
	if (pm)
	    pm++;
	*pd++ = _mm_cvtsi64_si32 (
	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
	w--;
    }

    while (w >= 4)
    {
	__m128i s;

	s = combine4 ((__m128i*)ps, (__m128i*)pm);

	save_128_aligned (
	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

	pd += 4;
	ps += 4;
	if (pm)
	    pm += 4;
	w -= 4;
    }

    while (w--)
    {
	s = combine1 (ps, pm);
	d = *pd;

	ps++;
	*pd++ = _mm_cvtsi64_si32 (
	    _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
	if (pm)
	    pm++;
    }
}
/* One-pixel SATURATE: if the source alpha exceeds the destination's
 * remaining headroom (~dst.alpha), scale the source down by da/sa
 * (DIV_UN8 from pixman-combine32.h) before the saturating add. */
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
	ms = pix_multiply_1x64 (
	    ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}
/* dst = src SATURATE dst for w pixels.  The 4-pixel body takes the
 * fast saturating-add path only when no source alpha exceeds the
 * corresponding destination headroom; otherwise it falls back to the
 * scalar pixel helper for that group of four.
 */
static force_inline void
core_combine_saturate_u_sse2 (uint32_t *      pd,
                              const uint32_t *ps,
                              const uint32_t *pm,
                              int             w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && (uintptr_t)pd & 15)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	w--;
	ps++;
	if (pm)
	    pm++;
    }

    while (w >= 4)
    {
	xmm_dst = load_128_aligned  ((__m128i*)pd);
	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

	pack_cmp = _mm_movemask_epi8 (
	    _mm_cmpgt_epi32 (
		_mm_srli_epi32 (xmm_src, 24),
		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

	/* if some alpha src is greater than respective ~alpha dst */
	if (pack_cmp)
	{
	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;

	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;

	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;

	    s = combine1 (ps++, pm);
	    d = *pd;
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	    if (pm)
		pm++;
	}
	else
	{
	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

	    pd += 4;
	    ps += 4;
	    if (pm)
		pm += 4;
	}

	w -= 4;
    }

    while (w--)
    {
	s = combine1 (ps, pm);
	d = *pd;

	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
	ps++;
	if (pm)
	    pm++;
    }
}
/* Component-alpha SRC: dst = src * mask per channel for w pixels.
 * pm must be non-NULL in the component-alpha variants.
 */
static force_inline void
core_combine_src_ca_sse2 (uint32_t*       pd,
                          const uint32_t* ps,
                          const uint32_t *pm,
                          int             w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
	w--;
    }

    while (w >= 4)
    {
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
	w--;
    }
}
/* One-pixel component-alpha OVER: (src IN mask) OVER dst, with the
 * per-channel mask also scaling src's alpha. */
static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expand_alpha_1x64 (s);
    __m64 unpk_mask = unpack_32_1x64 (mask);
    __m64 unpk_dst  = unpack_32_1x64 (dst);

    return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}
/* Component-alpha OVER: dst = (src IN mask) OVER dst for w pixels.
 * pm must be non-NULL.
 */
static force_inline void
core_combine_over_ca_sse2 (uint32_t*       pd,
                           const uint32_t* ps,
                           const uint32_t *pm,
                           int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
	w--;
    }

    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
		       &xmm_alpha_lo, &xmm_alpha_hi,
		       &xmm_mask_lo, &xmm_mask_hi,
		       &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
/* One-pixel component-alpha OVER-reverse: dst OVER (src IN mask). */
static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (
	over_1x64 (d, expand_alpha_1x64 (d),
		   pix_multiply_1x64 (unpack_32_1x64 (src),
				      unpack_32_1x64 (mask))));
}
/* Component-alpha OVER-reverse: dst = dst OVER (src IN mask) for w
 * pixels.  pm must be non-NULL.
 */
static force_inline void
core_combine_over_reverse_ca_sse2 (uint32_t*       pd,
                                   const uint32_t* ps,
                                   const uint32_t *pm,
                                   int             w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* uintptr_t avoids pointer truncation on LLP64 targets. */
    while (w && (uintptr_t)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
	w--;
    }

    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	/* mask registers are reused to hold src IN mask. */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
		    &xmm_alpha_lo, &xmm_alpha_hi,
		    &xmm_mask_lo, &xmm_mask_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
/* Component-alpha IN combiner for w pixels:
 *   *pd = (src * mask) * dst_alpha
 * Align head / 4-pixel SSE2 body / scalar tail. */
static force_inline void
core_combine_in_ca_sse2 (uint32_t * pd,
                         const uint32_t *ps,
                         const uint32_t *pm,
                         int w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: align destination to 16 bytes. */
    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
		expand_alpha_1x64 (unpack_32_1x64 (d))));

	w--;
    }

    /* Body: 4 pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);
	/* src * mask -> xmm_dst */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);
	/* (src * mask) * dst_alpha -> xmm_dst */
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: leftover pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		pix_multiply_1x64 (
		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
		expand_alpha_1x64 (unpack_32_1x64 (d))));

	w--;
    }
}
/* Component-alpha IN_REVERSE combiner for w pixels:
 *   *pd = dst * (mask * src_alpha)
 * Align head / 4-pixel SSE2 body / scalar tail. */
static force_inline void
core_combine_in_reverse_ca_sse2 (uint32_t * pd,
                                 const uint32_t *ps,
                                 const uint32_t *pm,
                                 int w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: align destination to 16 bytes. */
    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d),
		pix_multiply_1x64 (unpack_32_1x64 (m),
				   expand_alpha_1x64 (unpack_32_1x64 (s)))));

	w--;
    }

    /* Body: 4 pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);
	/* mask * src_alpha -> xmm_alpha */
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);
	/* dst * (mask * src_alpha) -> xmm_dst */
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: leftover pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d),
		pix_multiply_1x64 (unpack_32_1x64 (m),
				   expand_alpha_1x64 (unpack_32_1x64 (s)))));

	w--;
    }
}
/* Component-alpha OUT combiner for w pixels:
 *   *pd = (src * mask) * (1 - dst_alpha)
 * Align head / 4-pixel SSE2 body / scalar tail. */
static force_inline void
core_combine_out_ca_sse2 (uint32_t * pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: align destination to 16 bytes. */
    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		pix_multiply_1x64 (
		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
		negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));
	w--;
    }

    /* Body: 4 pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	/* 1 - dst_alpha -> xmm_alpha */
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);
	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
		      &xmm_alpha_lo, &xmm_alpha_hi);

	/* src * mask -> xmm_dst, then scale by (1 - dst_alpha) */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: leftover pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		pix_multiply_1x64 (
		    unpack_32_1x64 (s), unpack_32_1x64 (m)),
		negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d)))));

	w--;
    }
}
/* Component-alpha OUT_REVERSE combiner for w pixels:
 *   *pd = dst * (1 - (mask * src_alpha))
 * Align head / 4-pixel SSE2 body / scalar tail. */
static force_inline void
core_combine_out_reverse_ca_sse2 (uint32_t * pd,
                                  const uint32_t *ps,
                                  const uint32_t *pm,
                                  int w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: align destination to 16 bytes. */
    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d),
		negate_1x64 (pix_multiply_1x64 (
				 unpack_32_1x64 (m),
				 expand_alpha_1x64 (unpack_32_1x64 (s))))));
	w--;
    }

    /* Body: 4 pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi);

	/* 1 - (mask * src_alpha) -> xmm_mask */
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_lo, &xmm_alpha_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
		      &xmm_mask_lo, &xmm_mask_hi);

	/* dst * weight -> xmm_dst */
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: leftover pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    pix_multiply_1x64 (
		unpack_32_1x64 (d),
		negate_1x64 (pix_multiply_1x64 (
				 unpack_32_1x64 (m),
				 expand_alpha_1x64 (unpack_32_1x64 (s))))));
	w--;
    }
}
/* Component-alpha ATOP for a single pixel:
 * result = dst * (1 - mask*src_alpha) + (src*mask) * dst_alpha. */
static force_inline uint32_t
core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m64 mask64 = unpack_32_1x64 (mask);
    __m64 src64  = unpack_32_1x64 (src);
    __m64 dst64  = unpack_32_1x64 (dst);

    __m64 src_alpha = expand_alpha_1x64 (src64);
    __m64 dst_alpha = expand_alpha_1x64 (dst64);

    src64  = pix_multiply_1x64 (src64, mask64);
    mask64 = negate_1x64 (pix_multiply_1x64 (mask64, src_alpha));

    return pack_1x64_32 (
	pix_add_multiply_1x64 (&dst64, &mask64, &src64, &dst_alpha));
}
/* Component-alpha ATOP combiner for w pixels:
 *   *pd = dst * (1 - mask*src_alpha) + (src*mask) * dst_alpha
 * Align head / 4-pixel SSE2 body / scalar tail. */
static force_inline void
core_combine_atop_ca_sse2 (uint32_t * pd,
                           const uint32_t *ps,
                           const uint32_t *pm,
                           int w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: align destination to 16 bytes. */
    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* Body: 4 pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* src * mask -> xmm_src */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);
	/* 1 - (mask * src_alpha) -> xmm_mask */
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	pix_add_multiply_2x128 (
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: leftover pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
/* Component-alpha ATOP_REVERSE for a single pixel:
 * result = dst * (mask*src_alpha) + (src*mask) * (1 - dst_alpha). */
static force_inline uint32_t
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m64 mask64 = unpack_32_1x64 (mask);
    __m64 src64  = unpack_32_1x64 (src);
    __m64 dst64  = unpack_32_1x64 (dst);

    __m64 neg_dst_alpha = negate_1x64 (expand_alpha_1x64 (dst64));
    __m64 src_alpha     = expand_alpha_1x64 (src64);

    src64  = pix_multiply_1x64 (src64, mask64);
    mask64 = pix_multiply_1x64 (mask64, src_alpha);

    return pack_1x64_32 (
	pix_add_multiply_1x64 (&dst64, &mask64, &src64, &neg_dst_alpha));
}
/* Component-alpha ATOP_REVERSE combiner for w pixels:
 *   *pd = dst * (mask*src_alpha) + (src*mask) * (1 - dst_alpha)
 * Align head / 4-pixel SSE2 body / scalar tail. */
static force_inline void
core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
                                   const uint32_t *ps,
                                   const uint32_t *pm,
                                   int w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: align destination to 16 bytes. */
    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* Body: 4 pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* src * mask -> xmm_src */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);
	/* mask * src_alpha -> xmm_mask */
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	/* 1 - dst_alpha -> xmm_alpha_dst (the ATOP_REVERSE twist) */
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	pix_add_multiply_2x128 (
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: leftover pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
/* Component-alpha XOR for a single pixel:
 * result = dst * (1 - mask*src_alpha) + (src*mask) * (1 - dst_alpha). */
static force_inline uint32_t
core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                uint32_t mask,
                                uint32_t dst)
{
    __m64 mask64 = unpack_32_1x64 (mask);
    __m64 src64  = unpack_32_1x64 (src);
    __m64 dst64  = unpack_32_1x64 (dst);

    /* weight for dst: 1 - (mask * src_alpha) */
    __m64 neg_mask_sa = negate_1x64 (
	pix_multiply_1x64 (mask64, expand_alpha_1x64 (src64)));
    /* src IN mask */
    __m64 src_in_mask = pix_multiply_1x64 (src64, mask64);
    /* weight for (src IN mask): 1 - dst_alpha */
    __m64 neg_dst_alpha = negate_1x64 (expand_alpha_1x64 (dst64));

    return pack_1x64_32 (
	pix_add_multiply_1x64 (&dst64, &neg_mask_sa,
			       &src_in_mask, &neg_dst_alpha));
}
/* Component-alpha XOR combiner for w pixels:
 *   *pd = dst * (1 - mask*src_alpha) + (src*mask) * (1 - dst_alpha)
 * Align head / 4-pixel SSE2 body / scalar tail. */
static force_inline void
core_combine_xor_ca_sse2 (uint32_t * pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: align destination to 16 bytes. */
    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
	w--;
    }

    /* Body: 4 pixels per iteration. */
    while (w >= 4)
    {
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

	/* src * mask -> xmm_src */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);
	/* mask * src_alpha -> xmm_mask */
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi);

	/* Both weights are complemented for XOR. */
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
		      &xmm_mask_lo, &xmm_mask_hi);

	pix_add_multiply_2x128 (
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
	    &xmm_dst_lo, &xmm_dst_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: leftover pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
	w--;
    }
}
/* Component-alpha ADD combiner for w pixels:
 *   *pd = saturated_add (src * mask, dst)
 * Align head / 4-pixel SSE2 body / scalar tail. */
static force_inline void
core_combine_add_ca_sse2 (uint32_t * pd,
                          const uint32_t *ps,
                          const uint32_t *pm,
                          int w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    /* Head: align destination to 16 bytes. */
    while (w && (unsigned long)pd & 15)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
					     unpack_32_1x64 (m)),
			  unpack_32_1x64 (d)));
	w--;
    }

    /* Body: 4 pixels per iteration; saturating byte add keeps the
     * result clamped at 0xff per component. */
    while (w >= 4)
    {
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);

	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

	/* src * mask -> xmm_src */
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
			    &xmm_mask_lo, &xmm_mask_hi,
			    &xmm_src_lo, &xmm_src_hi);

	save_128_aligned (
	    (__m128i*)pd, pack_2x128_128 (
		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));

	ps += 4;
	pd += 4;
	pm += 4;
	w -= 4;
    }

    /* Tail: leftover pixels. */
    while (w)
    {
	s = *ps++;
	m = *pm++;
	d = *pd;

	*pd++ = pack_1x64_32 (
	    _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s),
					     unpack_32_1x64 (m)),
			  unpack_32_1x64 (d)));
	w--;
    }
}
/* ---------------------------------------------------
 * fb_compose_setup_SSE2
 */
/* Replicate a 16-bit value into all four 16-bit lanes of an MMX register. */
static force_inline __m64
create_mask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}
/* Replicate a 16-bit value into all eight 16-bit lanes of an XMM register. */
static force_inline __m128i
create_mask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}
/* Build an MMX register from two 32-bit halves: mask0 becomes the high
 * 32 bits, mask1 the low 32 bits (per _mm_set_pi32 argument order). */
static force_inline __m64
create_mask_2x32_64 (uint32_t mask0,
                     uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}
/* Build an XMM register holding the 64-bit pattern (mask0:mask1)
 * replicated in both halves.
 *
 * Work around a code generation bug in Sun Studio 12: use a macro
 * instead of an inline function there. */
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
# define create_mask_2x32_128(mask0, mask1)				\
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
#endif
/* SSE2 code patch for fbcompose.c */
/* Combiner entry point: forwards to core_combine_over_u_sse2, then
 * _mm_empty () clears MMX state before returning to FPU code. */
static void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_over_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Combiner entry point: forwards to core_combine_over_reverse_u_sse2,
 * then clears MMX state. */
static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dst,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    core_combine_over_reverse_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Combiner entry point: forwards to core_combine_in_u_sse2, then
 * clears MMX state. */
static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dst,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    core_combine_in_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Combiner entry point: forwards to core_combine_reverse_in_u_sse2,
 * then clears MMX state. */
static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dst,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    core_combine_reverse_in_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Combiner entry point: forwards to core_combine_out_u_sse2, then
 * clears MMX state. */
static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_out_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Combiner entry point: forwards to core_combine_reverse_out_u_sse2,
 * then clears MMX state. */
static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dst,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    core_combine_reverse_out_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Combiner entry point: forwards to core_combine_atop_u_sse2, then
 * clears MMX state. */
static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_atop_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Combiner entry point: forwards to core_combine_reverse_atop_u_sse2,
 * then clears MMX state. */
static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dst,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    core_combine_reverse_atop_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Combiner entry point: forwards to core_combine_xor_u_sse2, then
 * clears MMX state. */
static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_xor_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Combiner entry point: forwards to core_combine_add_u_sse2, then
 * clears MMX state. */
static void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_add_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Combiner entry point: forwards to core_combine_saturate_u_sse2, then
 * clears MMX state. */
static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               dst,
                         const uint32_t *         src,
                         const uint32_t *         mask,
                         int                      width)
{
    core_combine_saturate_u_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_src_ca_sse2, then clears MMX state. */
static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_src_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_over_ca_sse2, then clears MMX state. */
static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               dst,
                      const uint32_t *         src,
                      const uint32_t *         mask,
                      int                      width)
{
    core_combine_over_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_over_reverse_ca_sse2, then clears MMX state. */
static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               dst,
                              const uint32_t *         src,
                              const uint32_t *         mask,
                              int                      width)
{
    core_combine_over_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_in_ca_sse2, then clears MMX state. */
static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    core_combine_in_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_in_reverse_ca_sse2, then clears MMX state. */
static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dst,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    core_combine_in_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_out_ca_sse2, then clears MMX state. */
static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_out_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_out_reverse_ca_sse2, then clears MMX state. */
static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dst,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    core_combine_out_reverse_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_atop_ca_sse2, then clears MMX state. */
static void
sse2_combine_atop_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               dst,
                      const uint32_t *         src,
                      const uint32_t *         mask,
                      int                      width)
{
    core_combine_atop_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_reverse_atop_ca_sse2, then clears MMX state. */
static void
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               dst,
                              const uint32_t *         src,
                              const uint32_t *         mask,
                              int                      width)
{
    core_combine_reverse_atop_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_xor_ca_sse2, then clears MMX state. */
static void
sse2_combine_xor_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_xor_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* Component-alpha combiner entry point: forwards to
 * core_combine_add_ca_sse2, then clears MMX state. */
static void
sse2_combine_add_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dst,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    core_combine_add_ca_sse2 (dst, src, mask, width);
    _mm_empty ();
}
/* -------------------------------------------------------------------
* composite_over_n_8888
*/
/* Fast path: OVER with a solid source onto an a8r8g8b8 destination.
 * The solid color and its alpha are expanded once outside the loops;
 * each row uses an align head, a 4-pixel SSE2 body and a scalar tail. */
static void
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            pixman_image_t *         src_image,
                            pixman_image_t *         mask_image,
                            pixman_image_t *         dst_image,
                            int32_t                  src_x,
                            int32_t                  src_y,
                            int32_t                  mask_x,
                            int32_t                  mask_y,
                            int32_t                  dest_x,
                            int32_t                  dest_y,
                            int32_t                  width,
                            int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* A zero source leaves the destination unchanged under OVER. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	/* Head: align destination to 16 bytes. */
	while (w && (unsigned long)dst & 15)
	{
	    d = *dst;
	    *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
					      _mm_movepi64_pi64 (xmm_alpha),
					      unpack_32_1x64 (d)));
	    w--;
	}

	/* Body: 4 pixels per iteration. */
	while (w >= 4)
	{
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    over_2x128 (&xmm_src, &xmm_src,
			&xmm_alpha, &xmm_alpha,
			&xmm_dst_lo, &xmm_dst_hi);

	    /* rebuild the 4 pixel data and save */
	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    w -= 4;
	    dst += 4;
	}

	/* Tail: leftover pixels. */
	while (w)
	{
	    d = *dst;
	    *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
					      _mm_movepi64_pi64 (xmm_alpha),
					      unpack_32_1x64 (d)));
	    w--;
	}

    }
    _mm_empty ();
}
/* ---------------------------------------------------------------------
* composite_over_n_0565
*/
/* Fast path: OVER with a solid source onto an r5g6b5 destination.
 * Each 16-bit destination pixel is widened to 8x16-bit form, blended,
 * and packed back to 565.  The SIMD body handles 8 pixels (one XMM
 * register of 565 data) per iteration. */
static void
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            pixman_image_t *         src_image,
                            pixman_image_t *         mask_image,
                            pixman_image_t *         dst_image,
                            int32_t                  src_x,
                            int32_t                  src_y,
                            int32_t                  mask_x,
                            int32_t                  mask_y,
                            int32_t                  dest_x,
                            int32_t                  dest_y,
                            int32_t                  width,
                            int32_t                  height)
{
    uint32_t src;
    uint16_t    *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* A zero source leaves the destination unchanged under OVER. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	/* Head: align destination to 16 bytes. */
	while (w && (unsigned long)dst & 15)
	{
	    d = *dst;

	    *dst++ = pack_565_32_16 (
		pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
					 _mm_movepi64_pi64 (xmm_alpha),
					 expand565_16_1x64 (d))));
	    w--;
	}

	/* Body: 8 r5g6b5 pixels per iteration. */
	while (w >= 8)
	{
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

	    over_2x128 (&xmm_src, &xmm_src,
			&xmm_alpha, &xmm_alpha,
			&xmm_dst0, &xmm_dst1);
	    over_2x128 (&xmm_src, &xmm_src,
			&xmm_alpha, &xmm_alpha,
			&xmm_dst2, &xmm_dst3);

	    xmm_dst = pack_565_4x128_128 (
		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

	    save_128_aligned ((__m128i*)dst, xmm_dst);

	    dst += 8;
	    w -= 8;
	}

	/* Tail: leftover pixels. */
	while (w--)
	{
	    d = *dst;
	    *dst++ = pack_565_32_16 (
		pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src),
					 _mm_movepi64_pi64 (xmm_alpha),
					 expand565_16_1x64 (d))));
	}
    }

    _mm_empty ();
}
/* ------------------------------
* composite_add_n_8888_8888_ca
*/
/* Fast path: ADD with a solid source and a per-component a8r8g8b8 mask:
 *   *pd = saturated_add (src * mask, *pd)
 * Groups of 4 mask pixels that are entirely zero are skipped (the
 * destination is left untouched).
 *
 * Fix vs. previous revision: the locals `srca`, `xmm_alpha` and
 * `mmx_alpha` were computed but never used anywhere in the function;
 * they have been removed. */
static void
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_op_t              op,
                                   pixman_image_t *         src_image,
                                   pixman_image_t *         mask_image,
                                   pixman_image_t *         dst_image,
                                   int32_t                  src_x,
                                   int32_t                  src_y,
                                   int32_t                  mask_x,
                                   int32_t                  mask_y,
                                   int32_t                  dest_x,
                                   int32_t                  dest_y,
                                   int32_t                  width,
                                   int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, d;
    uint32_t    *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src;
    __m128i xmm_dst;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m64 mmx_src, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* ADD with a zero source is a no-op. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    /* Expand the solid color once; keep both XMM and MMX copies around
     * for the SIMD body and the scalar head/tail loops respectively. */
    xmm_src = _mm_unpacklo_epi8 (
	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    mmx_src = _mm_movepi64_pi64 (xmm_src);

    while (height--)
    {
	int w = width;
	const uint32_t *pm = (uint32_t *)mask_line;
	uint32_t *pd = (uint32_t *)dst_line;

	dst_line += dst_stride;
	mask_line += mask_stride;

	/* Head: align destination to 16 bytes; skip zero-mask pixels. */
	while (w && (unsigned long)pd & 15)
	{
	    m = *pm++;

	    if (m)
	    {
		d = *pd;

		mmx_mask = unpack_32_1x64 (m);
		mmx_dest = unpack_32_1x64 (d);

		*pd = pack_1x64_32 (
		    _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
	    }

	    pd++;
	    w--;
	}

	/* Body: 4 pixels per iteration. */
	while (w >= 4)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)pm);

	    pack_cmp =
		_mm_movemask_epi8 (
		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

	    /* if all 4 mask pixels are zero, pack_cmp is 0xffff */
	    if (pack_cmp != 0xffff)
	    {
		xmm_dst = load_128_aligned ((__m128i*)pd);

		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		pix_multiply_2x128 (&xmm_src, &xmm_src,
				    &xmm_mask_lo, &xmm_mask_hi,
				    &xmm_mask_lo, &xmm_mask_hi);
		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);

		save_128_aligned (
		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
	    }

	    pd += 4;
	    pm += 4;
	    w -= 4;
	}

	/* Tail: leftover pixels. */
	while (w)
	{
	    m = *pm++;

	    if (m)
	    {
		d = *pd;

		mmx_mask = unpack_32_1x64 (m);
		mmx_dest = unpack_32_1x64 (d);

		*pd = pack_1x64_32 (
		    _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
	    }

	    pd++;
	    w--;
	}
    }

    _mm_empty ();
}
/* ---------------------------------------------------------------------------
* composite_over_n_8888_8888_ca
*/
/* Fast path: OVER with a solid source and a per-component a8r8g8b8 mask:
 *   *pd = (src IN mask) OVER *pd
 * Groups of 4 mask pixels that are entirely zero are skipped. */
static void
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, d;
    uint32_t    *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* A zero source leaves the destination unchanged under OVER. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    /* Expand the solid color and its alpha once; keep XMM copies for
     * the SIMD body and MMX copies for the scalar head/tail loops. */
    xmm_src = _mm_unpacklo_epi8 (
	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src   = _mm_movepi64_pi64 (xmm_src);
    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);

    while (height--)
    {
	int w = width;
	const uint32_t *pm = (uint32_t *)mask_line;
	uint32_t *pd = (uint32_t *)dst_line;

	dst_line += dst_stride;
	mask_line += mask_stride;

	/* Head: align destination to 16 bytes; skip zero-mask pixels. */
	while (w && (unsigned long)pd & 15)
	{
	    m = *pm++;

	    if (m)
	    {
		d = *pd;

		mmx_mask = unpack_32_1x64 (m);
		mmx_dest = unpack_32_1x64 (d);

		*pd = pack_1x64_32 (in_over_1x64 (&mmx_src,
		                                  &mmx_alpha,
		                                  &mmx_mask,
		                                  &mmx_dest));
	    }

	    pd++;
	    w--;
	}

	/* Body: 4 pixels per iteration. */
	while (w >= 4)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)pm);

	    pack_cmp =
		_mm_movemask_epi8 (
		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

	    /* if all 4 mask pixels are zero, pack_cmp is 0xffff and the
	     * group can be skipped entirely */
	    if (pack_cmp != 0xffff)
	    {
		xmm_dst = load_128_aligned ((__m128i*)pd);

		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }

	    pd += 4;
	    pm += 4;
	    w -= 4;
	}

	/* Tail: leftover pixels. */
	while (w)
	{
	    m = *pm++;

	    if (m)
	    {
		d = *pd;

		mmx_mask = unpack_32_1x64 (m);
		mmx_dest = unpack_32_1x64 (d);

		*pd = pack_1x64_32 (
		    in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
	    }

	    pd++;
	    w--;
	}
    }

    _mm_empty ();
}
/*---------------------------------------------------------------------
* composite_over_8888_n_8888
*/
/* Fast path: OVER from a8r8g8b8 with a solid (n) mask onto a8r8g8b8:
 *   *dst = (src IN mask_alpha) OVER *dst
 * Only the alpha byte of the solid mask is used, replicated across all
 * 16-bit lanes.
 *
 * Fix vs. previous revision: in the alignment head loop the locals were
 * misleadingly named (`dest` held the mask and `alpha_dst` held the
 * destination pixel); they are renamed to match the tail loop.  No
 * behavior change. */
static void
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    uint32_t mask;
    int32_t w;
    int dst_stride, src_stride;

    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);

    /* Replicate the mask's alpha byte into every 16-bit lane. */
    xmm_mask = create_mask_16_128 (mask >> 24);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: align destination to 16 bytes. */
	while (w && (unsigned long)dst & 15)
	{
	    uint32_t s = *src++;
	    uint32_t d = *dst;

	    __m64 ms = unpack_32_1x64 (s);
	    __m64 alpha = expand_alpha_1x64 (ms);
	    __m64 mask = _mm_movepi64_pi64 (xmm_mask);
	    __m64 dest = unpack_32_1x64 (d);

	    *dst++ = pack_1x64_32 (
		in_over_1x64 (&ms, &alpha, &mask, &dest));

	    w--;
	}

	/* Body: 4 pixels per iteration. */
	while (w >= 4)
	{
	    xmm_src = load_128_unaligned ((__m128i*)src);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha_lo, &xmm_alpha_hi,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    dst += 4;
	    src += 4;
	    w -= 4;
	}

	/* Tail: leftover pixels. */
	while (w)
	{
	    uint32_t s = *src++;
	    uint32_t d = *dst;

	    __m64 ms = unpack_32_1x64 (s);
	    __m64 alpha = expand_alpha_1x64 (ms);
	    __m64 mask = _mm_movepi64_pi64 (xmm_mask);
	    __m64 dest = unpack_32_1x64 (d);

	    *dst++ = pack_1x64_32 (
		in_over_1x64 (&ms, &alpha, &mask, &dest));

	    w--;
	}
    }

    _mm_empty ();
}
/*---------------------------------------------------------------------
 * composite_src_x888_8888
 */
/* Fast path: SRC from x8r8g8b8 to a8r8g8b8 — copy each pixel while
 * forcing the alpha byte to 0xff. */
static void
sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              pixman_image_t *         src_image,
                              pixman_image_t *         mask_image,
                              pixman_image_t *         dst_image,
                              int32_t                  src_x,
                              int32_t                  src_y,
                              int32_t                  mask_x,
                              int32_t                  mask_y,
                              int32_t                  dest_x,
                              int32_t                  dest_y,
                              int32_t                  width,
                              int32_t                  height)
{
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int32_t w;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: scalar copies until dst is 16-byte aligned. */
	while (w && (unsigned long)dst & 15)
	{
	    *dst++ = *src++ | 0xff000000;
	    w--;
	}

	/* Body: 16 pixels (four XMM stores) per iteration. */
	while (w >= 16)
	{
	    __m128i px0 = load_128_unaligned ((__m128i*)src + 0);
	    __m128i px1 = load_128_unaligned ((__m128i*)src + 1);
	    __m128i px2 = load_128_unaligned ((__m128i*)src + 2);
	    __m128i px3 = load_128_unaligned ((__m128i*)src + 3);

	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (px0, mask_ff000000));
	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (px1, mask_ff000000));
	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (px2, mask_ff000000));
	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (px3, mask_ff000000));

	    dst += 16;
	    src += 16;
	    w -= 16;
	}

	/* Tail: scalar copies for the remainder. */
	while (w)
	{
	    *dst++ = *src++ | 0xff000000;
	    w--;
	}
    }

    _mm_empty ();
}
/* ---------------------------------------------------------------------
* composite_over_x888_n_8888
*/
static void
sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                 pixman_op_t op,
                                 pixman_image_t * src_image,
                                 pixman_image_t * mask_image,
                                 pixman_image_t * dst_image,
                                 int32_t src_x,
                                 int32_t src_y,
                                 int32_t mask_x,
                                 int32_t mask_y,
                                 int32_t dest_x,
                                 int32_t dest_y,
                                 int32_t width,
                                 int32_t height)
{
    /* OVER: x8r8g8b8 source with a solid mask onto an a8r8g8b8
     * destination.  Source pixels are forced opaque (| 0xff000000),
     * and only the alpha byte of the solid mask is used. */
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    int32_t w;

    __m128i xmm_mask, xmm_alpha;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);

    /* Replicate the mask alpha across all channels; the source alpha is
     * constant 0xff, so xmm_alpha is simply the 0x00ff expansion. */
    xmm_mask = create_mask_16_128 (mask >> 24);
    xmm_alpha = mask_00ff;

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: scalar (MMX) pixels until dst is 16-byte aligned.
	 * Note the inner __m64 'src'/'mask' intentionally shadow the
	 * outer pointer/scalar of the same names. */
	while (w && (unsigned long)dst & 15)
	{
	    uint32_t s = (*src++) | 0xff000000;
	    uint32_t d = *dst;

	    __m64 src = unpack_32_1x64 (s);
	    __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
	    __m64 mask = _mm_movepi64_pi64 (xmm_mask);
	    __m64 dest = unpack_32_1x64 (d);

	    *dst++ = pack_1x64_32 (
		in_over_1x64 (&src, &alpha, &mask, &dest));

	    w--;
	}

	/* Main loop: 4 pixels per iteration. */
	while (w >= 4)
	{
	    xmm_src = _mm_or_si128 (
		load_128_unaligned ((__m128i*)src), mask_ff000000);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha, &xmm_alpha,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    dst += 4;
	    src += 4;
	    w -= 4;
	}

	/* Scalar tail, same per-pixel path as the head loop. */
	while (w)
	{
	    uint32_t s = (*src++) | 0xff000000;
	    uint32_t d = *dst;

	    __m64 src = unpack_32_1x64 (s);
	    __m64 alpha = _mm_movepi64_pi64 (xmm_alpha);
	    __m64 mask = _mm_movepi64_pi64 (xmm_mask);
	    __m64 dest = unpack_32_1x64 (d);

	    *dst++ = pack_1x64_32 (
		in_over_1x64 (&src, &alpha, &mask, &dest));

	    w--;
	}
    }

    _mm_empty ();
}
/* --------------------------------------------------------------------
* composite_over_8888_8888
*/
static void
sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
                               pixman_op_t op,
                               pixman_image_t * src_image,
                               pixman_image_t * mask_image,
                               pixman_image_t * dst_image,
                               int32_t src_x,
                               int32_t src_y,
                               int32_t mask_x,
                               int32_t mask_y,
                               int32_t dest_x,
                               int32_t dest_y,
                               int32_t width,
                               int32_t height)
{
    /* OVER for a8r8g8b8 source onto a8r8g8b8 destination: each scanline
     * is delegated to the generic SSE2 OVER combiner. */
    uint32_t *dst_row, *src_row;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);

    while (height--)
    {
	core_combine_over_u_sse2 (dst_row, src_row, NULL, width);

	dst_row += dst_stride;
	src_row += src_stride;
    }
    _mm_empty ();
}
/* ------------------------------------------------------------------
* composite_over_8888_0565
*/
static force_inline uint16_t
composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
{
    /* OVER a single a8r8g8b8 pixel onto a single r5g6b5 pixel. */
    __m64 s = unpack_32_1x64 (src);
    __m64 sa = expand_alpha_1x64 (s);
    __m64 d = expand565_16_1x64 (dst);

    return pack_565_32_16 (pack_1x64_32 (over_1x64 (s, sa, d)));
}
static void
sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
                               pixman_op_t op,
                               pixman_image_t * src_image,
                               pixman_image_t * mask_image,
                               pixman_image_t * dst_image,
                               int32_t src_x,
                               int32_t src_y,
                               int32_t mask_x,
                               int32_t mask_y,
                               int32_t dest_x,
                               int32_t dest_y,
                               int32_t width,
                               int32_t height)
{
    /* OVER: a8r8g8b8 source onto an r5g6b5 destination, no mask. */
    uint16_t *dst_line, *dst, d;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME
     *
     * I copy the code from MMX one and keep the fixme.
     * If it's a problem there, probably is a problem here.
     */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	src = src_line;

	dst_line += dst_stride;
	src_line += src_stride;
	w = width;

	/* Align dst on a 16-byte boundary */
	while (w &&
	       ((unsigned long)dst & 15))
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = composite_over_8888_0565pixel (s, d);
	    w--;
	}

	/* Main loop: 8 pixels per iteration.  One aligned 128-bit load
	 * covers 8 r5g6b5 destination pixels; the source needs two
	 * 128-bit loads (4 pixels each). */
	while (w >= 8)
	{
	    /* I'm loading unaligned because I'm not sure
	     * about the address alignment.
	     */
	    xmm_src = load_128_unaligned ((__m128i*) src);
	    xmm_dst = load_128_aligned ((__m128i*) dst);

	    /* Unpacking */
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    /* I'm loading next 4 pixels from memory
	     * before to optimze the memory read.
	     */
	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst0, &xmm_dst1);

	    /* Unpacking */
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
			&xmm_alpha_lo, &xmm_alpha_hi,
			&xmm_dst2, &xmm_dst3);

	    /* Repack the four 16-bit-per-channel halves back to 565. */
	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    dst += 8;
	    src += 8;
	}

	/* Scalar tail. */
	while (w--)
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = composite_over_8888_0565pixel (s, d);
	}
    }

    _mm_empty ();
}
/* -----------------------------------------------------------------
* composite_over_n_8_8888
*/
static void
sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
                              pixman_op_t op,
                              pixman_image_t * src_image,
                              pixman_image_t * mask_image,
                              pixman_image_t * dst_image,
                              int32_t src_x,
                              int32_t src_y,
                              int32_t mask_x,
                              int32_t mask_y,
                              int32_t dest_x,
                              int32_t dest_y,
                              int32_t width,
                              int32_t height)
{
    /* OVER: solid source with an a8 mask onto an a8r8g8b8 destination. */
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t m, d;   /* note: head/tail loops shadow 'm' with a uint8_t */

    __m128i xmm_src, xmm_alpha, xmm_def;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    /* Transparent solid source: OVER leaves the destination unchanged. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    /* xmm_def holds the raw solid for the opaque fast path below. */
    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = _mm_movepi64_pi64 (xmm_src);
    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: scalar pixels until dst is 16-byte aligned. */
	while (w && (unsigned long)dst & 15)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_pixel_8_1x64 (m);
		mmx_dest = unpack_32_1x64 (d);

		*dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
		                                   &mmx_alpha,
		                                   &mmx_mask,
		                                   &mmx_dest));
	    }

	    w--;
	    dst++;
	}

	/* Main loop: 4 pixels, reading 4 mask bytes at once. */
	while (w >= 4)
	{
	    m = *((uint32_t*)mask);

	    if (srca == 0xff && m == 0xffffffff)
	    {
		/* Opaque solid, full mask: plain store. */
		save_128_aligned ((__m128i*)dst, xmm_def);
	    }
	    else if (m)
	    {
		xmm_dst = load_128_aligned ((__m128i*) dst);
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	    /* m == 0: nothing to composite, skip the 4 pixels. */

	    w -= 4;
	    dst += 4;
	    mask += 4;
	}

	/* Scalar tail, same path as the head loop. */
	while (w)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_pixel_8_1x64 (m);
		mmx_dest = unpack_32_1x64 (d);

		*dst = pack_1x64_32 (in_over_1x64 (&mmx_src,
		                                   &mmx_alpha,
		                                   &mmx_mask,
		                                   &mmx_dest));
	    }

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}
/* ----------------------------------------------------------------
 * pixman_fill_sse2
 */
pixman_bool_t
pixman_fill_sse2 (uint32_t *bits,
                  int stride,
                  int bpp,
                  int x,
                  int y,
                  int width,
                  int height,
                  uint32_t data)
{
    /* Solid fill of a rectangle for 8/16/32 bpp surfaces.
     *
     * 'stride' is given in uint32_t units and is rescaled to units of the
     * destination pixel size, then back to bytes.  'data' is replicated so
     * that a full 32-bit word (and hence the 128-bit xmm_def) contains the
     * fill pattern regardless of bpp.
     *
     * Returns FALSE for unsupported bpp so the caller can fall back. */
    uint32_t byte_width;
    uint8_t *byte_line;

    __m128i xmm_def;

    if (bpp == 8)
    {
	uint8_t b;
	uint16_t w;

	stride = stride * (int) sizeof (uint32_t) / 1;
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	byte_width = width;
	stride *= 1;

	/* Replicate the low byte into all four bytes of 'data'. */
	b = data & 0xff;
	w = (b << 8) | b;
	data = (w << 16) | w;
    }
    else if (bpp == 16)
    {
	stride = stride * (int) sizeof (uint32_t) / 2;
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
	byte_width = 2 * width;
	stride *= 2;

	/* Replicate the low 16 bits into both halves of 'data'. */
	data = (data & 0xffff) * 0x00010001;
    }
    else if (bpp == 32)
    {
	stride = stride * (int) sizeof (uint32_t) / 4;
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
	byte_width = 4 * width;
	stride *= 4;
    }
    else
    {
	return FALSE;
    }

    xmm_def = create_mask_2x32_128 (data, data);

    while (height--)
    {
	int w;
	uint8_t *d = byte_line;
	byte_line += stride;
	w = byte_width;

	/* Alignment cascade: write 1, 2 then 4 bytes at a time until
	 * 'd' reaches a 16-byte boundary.  The narrow stores take the
	 * low bytes of the replicated 'data', which is why the pattern
	 * replication above matters. */
	while (w >= 1 && ((unsigned long)d & 1))
	{
	    *(uint8_t *)d = data;
	    w -= 1;
	    d += 1;
	}

	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(uint16_t *)d = data;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((unsigned long)d & 15))
	{
	    *(uint32_t *)d = data;

	    w -= 4;
	    d += 4;
	}

	/* Bulk fill: 128 bytes per iteration with aligned stores. */
	while (w >= 128)
	{
	    save_128_aligned ((__m128i*)(d), xmm_def);
	    save_128_aligned ((__m128i*)(d + 16), xmm_def);
	    save_128_aligned ((__m128i*)(d + 32), xmm_def);
	    save_128_aligned ((__m128i*)(d + 48), xmm_def);
	    save_128_aligned ((__m128i*)(d + 64), xmm_def);
	    save_128_aligned ((__m128i*)(d + 80), xmm_def);
	    save_128_aligned ((__m128i*)(d + 96), xmm_def);
	    save_128_aligned ((__m128i*)(d + 112), xmm_def);

	    d += 128;
	    w -= 128;
	}

	/* Binary step-down: 64, 32, 16 bytes at most once each. */
	if (w >= 64)
	{
	    save_128_aligned ((__m128i*)(d), xmm_def);
	    save_128_aligned ((__m128i*)(d + 16), xmm_def);
	    save_128_aligned ((__m128i*)(d + 32), xmm_def);
	    save_128_aligned ((__m128i*)(d + 48), xmm_def);

	    d += 64;
	    w -= 64;
	}

	if (w >= 32)
	{
	    save_128_aligned ((__m128i*)(d), xmm_def);
	    save_128_aligned ((__m128i*)(d + 16), xmm_def);

	    d += 32;
	    w -= 32;
	}

	if (w >= 16)
	{
	    save_128_aligned ((__m128i*)(d), xmm_def);

	    d += 16;
	    w -= 16;
	}

	/* Scalar tail: 4, 2, 1 bytes. */
	while (w >= 4)
	{
	    *(uint32_t *)d = data;

	    w -= 4;
	    d += 4;
	}

	if (w >= 2)
	{
	    *(uint16_t *)d = data;
	    w -= 2;
	    d += 2;
	}

	if (w >= 1)
	{
	    *(uint8_t *)d = data;
	    w -= 1;
	    d += 1;
	}
    }

    _mm_empty ();
    return TRUE;
}
static void
sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
                             pixman_op_t op,
                             pixman_image_t * src_image,
                             pixman_image_t * mask_image,
                             pixman_image_t * dst_image,
                             int32_t src_x,
                             int32_t src_y,
                             int32_t mask_x,
                             int32_t mask_y,
                             int32_t dest_x,
                             int32_t dest_y,
                             int32_t width,
                             int32_t height)
{
    /* SRC: solid source with an a8 mask onto an a8r8g8b8 destination.
     * Unlike OVER, SRC replaces the destination, so mask == 0 writes 0. */
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t m;

    __m128i xmm_src, xmm_def;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    /* SRC with a zero solid is equivalent to clearing the rectangle. */
    if (src == 0)
    {
	pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride,
	                  PIXMAN_FORMAT_BPP (dst_image->bits.format),
	                  dest_x, dest_y, width, height, 0);
	return;
    }

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    /* xmm_def holds the raw solid for the opaque fast path below. */
    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: scalar pixels until dst is 16-byte aligned. */
	while (w && (unsigned long)dst & 15)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		*dst = pack_1x64_32 (
		    pix_multiply_1x64 (
			_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
	    }
	    else
	    {
		*dst = 0;
	    }

	    w--;
	    dst++;
	}

	/* Main loop: 4 pixels, reading 4 mask bytes at once. */
	while (w >= 4)
	{
	    m = *((uint32_t*)mask);

	    if (srca == 0xff && m == 0xffffffff)
	    {
		/* Opaque solid, full mask: plain store. */
		save_128_aligned ((__m128i*)dst, xmm_def);
	    }
	    else if (m)
	    {
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		pix_multiply_2x128 (&xmm_src, &xmm_src,
				    &xmm_mask_lo, &xmm_mask_hi,
				    &xmm_mask_lo, &xmm_mask_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
	    }
	    else
	    {
		/* Zero mask: SRC writes transparent black. */
		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
	    }

	    w -= 4;
	    dst += 4;
	    mask += 4;
	}

	/* Scalar tail, same path as the head loop. */
	while (w)
	{
	    uint8_t m = *mask++;

	    if (m)
	    {
		*dst = pack_1x64_32 (
		    pix_multiply_1x64 (
			_mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m)));
	    }
	    else
	    {
		*dst = 0;
	    }

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}
/*-----------------------------------------------------------------------
* composite_over_n_8_0565
*/
static void
sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
                              pixman_op_t op,
                              pixman_image_t * src_image,
                              pixman_image_t * mask_image,
                              pixman_image_t * dst_image,
                              int32_t src_x,
                              int32_t src_y,
                              int32_t mask_x,
                              int32_t mask_y,
                              int32_t dest_x,
                              int32_t dest_y,
                              int32_t width,
                              int32_t height)
{
    /* OVER: solid source with an a8 mask onto an r5g6b5 destination. */
    uint32_t src, srca;
    uint16_t *dst_line, *dst, d;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t m;
    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    /* Transparent solid source: OVER leaves the destination unchanged. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = _mm_movepi64_pi64 (xmm_src);
    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: scalar (MMX) pixels until dst is 16-byte aligned. */
	while (w && (unsigned long)dst & 15)
	{
	    m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
		mmx_dest = expand565_16_1x64 (d);

		*dst = pack_565_32_16 (
		    pack_1x64_32 (
			in_over_1x64 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }

	    w--;
	    dst++;
	}

	/* Main loop: 8 destination pixels per iteration, processed as two
	 * groups of 4 with 4 mask bytes read per group.  A zero group of
	 * mask bytes skips the compositing for those 4 pixels. */
	while (w >= 8)
	{
	    xmm_dst = load_128_aligned ((__m128i*) dst);
	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

	    m = *((uint32_t*)mask);
	    mask += 4;

	    if (m)
	    {
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst0, &xmm_dst1);
	    }

	    m = *((uint32_t*)mask);
	    mask += 4;

	    if (m)
	    {
		xmm_mask = unpack_32_1x128 (m);
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

		/* Unpacking */
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
					&xmm_mask_lo, &xmm_mask_hi);
		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst2, &xmm_dst3);
	    }

	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    dst += 8;
	}

	/* Scalar tail, same path as the head loop. */
	while (w)
	{
	    m = *mask++;

	    if (m)
	    {
		d = *dst;
		mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
		mmx_dest = expand565_16_1x64 (d);

		*dst = pack_565_32_16 (
		    pack_1x64_32 (
			in_over_1x64 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}
/* -----------------------------------------------------------------------
* composite_over_pixbuf_0565
*/
static void
sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                 pixman_op_t op,
                                 pixman_image_t * src_image,
                                 pixman_image_t * mask_image,
                                 pixman_image_t * dst_image,
                                 int32_t src_x,
                                 int32_t src_y,
                                 int32_t mask_x,
                                 int32_t mask_y,
                                 int32_t dest_x,
                                 int32_t dest_y,
                                 int32_t width,
                                 int32_t height)
{
    /* OVER for a "pixbuf" source (handled via the over_rev_non_pre /
     * invert_colors helpers) onto an r5g6b5 destination.  Fully opaque
     * source groups take the invert_colors fast path; fully transparent
     * groups are skipped. */
    uint16_t *dst_line, *dst, d;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t opaque, zero;

    __m64 ms;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME
     *
     * I copy the code from MMX one and keep the fixme.
     * If it's a problem there, probably is a problem here.
     */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: scalar (MMX) pixels until dst is 16-byte aligned. */
	while (w && (unsigned long)dst & 15)
	{
	    s = *src++;
	    d = *dst;

	    ms = unpack_32_1x64 (s);

	    *dst++ = pack_565_32_16 (
		pack_1x64_32 (
		    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
	    w--;
	}

	/* Main loop: 8 destination pixels, as two groups of 4 source
	 * pixels; the second group is preloaded before the first is
	 * composited. */
	while (w >= 8)
	{
	    /* First round */
	    xmm_src = load_128_unaligned ((__m128i*)src);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    opaque = is_opaque (xmm_src);
	    zero = is_zero (xmm_src);

	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

	    /* preload next round*/
	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));

	    if (opaque)
	    {
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
				     &xmm_dst0, &xmm_dst1);
	    }
	    else if (!zero)
	    {
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
					&xmm_dst0, &xmm_dst1);
	    }

	    /* Second round */
	    opaque = is_opaque (xmm_src);
	    zero = is_zero (xmm_src);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

	    if (opaque)
	    {
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
				     &xmm_dst2, &xmm_dst3);
	    }
	    else if (!zero)
	    {
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
					&xmm_dst2, &xmm_dst3);
	    }

	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    src += 8;
	    dst += 8;
	}

	/* Scalar tail, same path as the head loop. */
	while (w)
	{
	    s = *src++;
	    d = *dst;

	    ms = unpack_32_1x64 (s);

	    *dst++ = pack_565_32_16 (
		pack_1x64_32 (
		    over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d))));
	    w--;
	}
    }

    _mm_empty ();
}
/* -------------------------------------------------------------------------
* composite_over_pixbuf_8888
*/
static void
sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                 pixman_op_t op,
                                 pixman_image_t * src_image,
                                 pixman_image_t * mask_image,
                                 pixman_image_t * dst_image,
                                 int32_t src_x,
                                 int32_t src_y,
                                 int32_t mask_x,
                                 int32_t mask_y,
                                 int32_t dest_x,
                                 int32_t dest_y,
                                 int32_t width,
                                 int32_t height)
{
    /* OVER for a "pixbuf" source (via over_rev_non_pre / invert_colors
     * helpers) onto an a8r8g8b8 destination.  Fully opaque source groups
     * skip the destination load entirely; fully transparent groups are
     * skipped. */
    uint32_t *dst_line, *dst, d;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t opaque, zero;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME
     *
     * I copy the code from MMX one and keep the fixme.
     * If it's a problem there, probably is a problem here.
     */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: scalar (MMX) pixels until dst is 16-byte aligned. */
	while (w && (unsigned long)dst & 15)
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = pack_1x64_32 (
		over_rev_non_pre_1x64 (
		    unpack_32_1x64 (s), unpack_32_1x64 (d)));

	    w--;
	}

	/* Main loop: 4 pixels per iteration. */
	while (w >= 4)
	{
	    xmm_src_hi = load_128_unaligned ((__m128i*)src);

	    opaque = is_opaque (xmm_src_hi);
	    zero = is_zero (xmm_src_hi);

	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

	    if (opaque)
	    {
		/* Opaque source: no blend needed, just color-convert. */
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
				     &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }
	    else if (!zero)
	    {
		xmm_dst_hi = load_128_aligned ((__m128i*)dst);

		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
					&xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned (
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	/* Scalar tail, same path as the head loop. */
	while (w)
	{
	    s = *src++;
	    d = *dst;

	    *dst++ = pack_1x64_32 (
		over_rev_non_pre_1x64 (
		    unpack_32_1x64 (s), unpack_32_1x64 (d)));

	    w--;
	}
    }

    _mm_empty ();
}
/* -------------------------------------------------------------------------------------------------
* composite_over_n_8888_0565_ca
*/
static void
sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                    pixman_op_t op,
                                    pixman_image_t * src_image,
                                    pixman_image_t * mask_image,
                                    pixman_image_t * dst_image,
                                    int32_t src_x,
                                    int32_t src_y,
                                    int32_t mask_x,
                                    int32_t mask_y,
                                    int32_t dest_x,
                                    int32_t dest_y,
                                    int32_t width,
                                    int32_t height)
{
    /* OVER: solid source with a per-component a8r8g8b8 mask (component
     * alpha) onto an r5g6b5 destination. */
    uint32_t src;
    uint16_t *dst_line, *dst, d;
    uint32_t *mask_line, *mask, m;
    int dst_stride, mask_stride;
    int w;
    uint32_t pack_cmp;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* Transparent solid source: OVER leaves the destination unchanged. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = _mm_movepi64_pi64 (xmm_src);
    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);

    while (height--)
    {
	w = width;
	mask = mask_line;
	dst = dst_line;
	mask_line += mask_stride;
	dst_line += dst_stride;

	/* Head: scalar (MMX) pixels until dst is 16-byte aligned. */
	while (w && ((unsigned long)dst & 15))
	{
	    m = *(uint32_t *) mask;

	    if (m)
	    {
		d = *dst;
		mmx_mask = unpack_32_1x64 (m);
		mmx_dest = expand565_16_1x64 (d);

		*dst = pack_565_32_16 (
		    pack_1x64_32 (
			in_over_1x64 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }

	    w--;
	    dst++;
	    mask++;
	}

	/* Main loop: 8 destination pixels, as two groups of 4 mask
	 * pixels.  pack_cmp == 0xffff means all 4 mask pixels are zero,
	 * in which case that group is skipped. */
	while (w >= 8)
	{
	    /* First round */
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    pack_cmp = _mm_movemask_epi8 (
		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

	    unpack_565_128_4x128 (xmm_dst,
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

	    /* preload the mask for the second round */
	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));

	    if (pack_cmp != 0xffff)
	    {
		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst0, &xmm_dst1);
	    }

	    /* Second round */
	    pack_cmp = _mm_movemask_epi8 (
		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

	    if (pack_cmp != 0xffff)
	    {
		in_over_2x128 (&xmm_src, &xmm_src,
			       &xmm_alpha, &xmm_alpha,
			       &xmm_mask_lo, &xmm_mask_hi,
			       &xmm_dst2, &xmm_dst3);
	    }

	    save_128_aligned (
		(__m128i*)dst, pack_565_4x128_128 (
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

	    w -= 8;
	    dst += 8;
	    mask += 8;
	}

	/* Scalar tail, same path as the head loop. */
	while (w)
	{
	    m = *(uint32_t *) mask;

	    if (m)
	    {
		d = *dst;
		mmx_mask = unpack_32_1x64 (m);
		mmx_dest = expand565_16_1x64 (d);

		*dst = pack_565_32_16 (
		    pack_1x64_32 (
			in_over_1x64 (
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
	    }

	    w--;
	    dst++;
	    mask++;
	}
    }

    _mm_empty ();
}
/* -----------------------------------------------------------------------
* composite_in_n_8_8
*/
static void
sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
                         pixman_op_t op,
                         pixman_image_t * src_image,
                         pixman_image_t * mask_image,
                         pixman_image_t * dst_image,
                         int32_t src_x,
                         int32_t src_y,
                         int32_t mask_x,
                         int32_t mask_y,
                         int32_t dest_x,
                         int32_t dest_y,
                         int32_t width,
                         int32_t height)
{
    /* IN: a8 destination multiplied by (solid alpha * a8 mask).
     *
     * Fix: the original declared and assigned 'uint8_t sa = src >> 24'
     * but never used it; the dead store has been removed.  Only the
     * expanded solid alpha (xmm_alpha) participates in the computation. */
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t d, m;
    uint32_t src;
    int32_t w;

    __m128i xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: one byte at a time until dst is 16-byte aligned. */
	while (w && ((unsigned long)dst & 15))
	{
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x64_32 (
		pix_multiply_1x64 (
		    pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha),
				       unpack_32_1x64 (m)),
		    unpack_32_1x64 (d)));
	    w--;
	}

	/* Main loop: 16 bytes per iteration. */
	while (w >= 16)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    /* mask' = alpha * mask;  dst = mask' * dst */
	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
				&xmm_mask_lo, &xmm_mask_hi,
				&xmm_mask_lo, &xmm_mask_hi);

	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
				&xmm_dst_lo, &xmm_dst_hi,
				&xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    mask += 16;
	    dst += 16;
	    w -= 16;
	}

	/* Scalar tail, same computation as the head loop. */
	while (w)
	{
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x64_32 (
		pix_multiply_1x64 (
		    pix_multiply_1x64 (
			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
		    unpack_32_1x64 (d)));
	    w--;
	}
    }

    _mm_empty ();
}
/* -----------------------------------------------------------------------
* composite_in_n_8
*/
static void
sse2_composite_in_n_8 (pixman_implementation_t *imp,
                       pixman_op_t op,
                       pixman_image_t * src_image,
                       pixman_image_t * mask_image,
                       pixman_image_t * dst_image,
                       int32_t src_x,
                       int32_t src_y,
                       int32_t mask_x,
                       int32_t mask_y,
                       int32_t dest_x,
                       int32_t dest_y,
                       int32_t width,
                       int32_t height)
{
    /* IN: a8 destination multiplied by the alpha of a solid source.
     *
     * Fix: the original expanded xmm_alpha before the two early-return
     * paths (wasted work), and destructively shifted 'src' to test the
     * alpha.  The alpha test now uses a separate 'srca' and xmm_alpha is
     * computed only when actually needed. */
    uint8_t *dst_line, *dst;
    int dst_stride;
    uint32_t d;
    uint32_t src, srca;
    int32_t w;

    __m128i xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
    srca = src >> 24;

    /* Opaque solid: IN leaves the destination unchanged. */
    if (srca == 0xff)
	return;

    /* Transparent solid: IN clears the destination. */
    if (srca == 0x00)
    {
	pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
	             8, dest_x, dest_y, width, height, 0);

	return;
    }

    /* Only the alpha channel of the solid matters for an a8 target. */
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	/* Head: one byte at a time until dst is 16-byte aligned. */
	while (w && ((unsigned long)dst & 15))
	{
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x64_32 (
		pix_multiply_1x64 (
		    _mm_movepi64_pi64 (xmm_alpha),
		    unpack_32_1x64 (d)));
	    w--;
	}

	/* Main loop: 16 bytes per iteration. */
	while (w >= 16)
	{
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
				&xmm_dst_lo, &xmm_dst_hi,
				&xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    dst += 16;
	    w -= 16;
	}

	/* Scalar tail, same computation as the head loop. */
	while (w)
	{
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x64_32 (
		pix_multiply_1x64 (
		    _mm_movepi64_pi64 (xmm_alpha),
		    unpack_32_1x64 (d)));
	    w--;
	}
    }

    _mm_empty ();
}
/* ---------------------------------------------------------------------------
* composite_in_8_8
*/
static void
sse2_composite_in_8_8 (pixman_implementation_t *imp,
                       pixman_op_t op,
                       pixman_image_t * src_image,
                       pixman_image_t * mask_image,
                       pixman_image_t * dst_image,
                       int32_t src_x,
                       int32_t src_y,
                       int32_t mask_x,
                       int32_t mask_y,
                       int32_t dest_x,
                       int32_t dest_y,
                       int32_t width,
                       int32_t height)
{
    /* IN for two a8 surfaces: dst = dst * src, pixel by pixel. */
    uint8_t *dst_line, *src_line;
    uint8_t *d8, *s8;
    int src_stride, dst_stride;
    int32_t w;

    __m128i xmm_s, xmm_s_lo, xmm_s_hi;
    __m128i xmm_d, xmm_d_lo, xmm_d_hi;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
	d8 = dst_line;
	s8 = src_line;
	dst_line += dst_stride;
	src_line += src_stride;
	w = width;

	/* One byte at a time until the destination is 16-byte aligned. */
	for (; w && ((unsigned long)d8 & 15); w--)
	{
	    uint32_t s = (uint32_t) *s8++;
	    uint32_t d = (uint32_t) *d8;

	    *d8++ = (uint8_t) pack_1x64_32 (
		pix_multiply_1x64 (
		    unpack_32_1x64 (s), unpack_32_1x64 (d)));
	}

	/* 16 bytes per iteration: unaligned source load, aligned
	 * destination load/store. */
	for (; w >= 16; w -= 16, s8 += 16, d8 += 16)
	{
	    xmm_s = load_128_unaligned ((__m128i*)s8);
	    xmm_d = load_128_aligned ((__m128i*)d8);

	    unpack_128_2x128 (xmm_s, &xmm_s_lo, &xmm_s_hi);
	    unpack_128_2x128 (xmm_d, &xmm_d_lo, &xmm_d_hi);

	    pix_multiply_2x128 (&xmm_s_lo, &xmm_s_hi,
				&xmm_d_lo, &xmm_d_hi,
				&xmm_d_lo, &xmm_d_hi);

	    save_128_aligned (
		(__m128i*)d8, pack_2x128_128 (xmm_d_lo, xmm_d_hi));
	}

	/* Scalar tail. */
	for (; w; w--)
	{
	    uint32_t s = (uint32_t) *s8++;
	    uint32_t d = (uint32_t) *d8;

	    *d8++ = (uint8_t) pack_1x64_32 (
		pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
	}
    }

    _mm_empty ();
}
/* -------------------------------------------------------------------------
* composite_add_n_8_8
*/
static void
sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
                          pixman_op_t op,
                          pixman_image_t * src_image,
                          pixman_image_t * mask_image,
                          pixman_image_t * dst_image,
                          int32_t src_x,
                          int32_t src_y,
                          int32_t mask_x,
                          int32_t mask_y,
                          int32_t dest_x,
                          int32_t dest_y,
                          int32_t width,
                          int32_t height)
{
    /* ADD: dst = saturate (dst + solid_alpha * mask) for a8 surfaces.
     *
     * Fix: the original declared and assigned 'uint8_t sa = src >> 24'
     * but never used it; the dead store has been removed. */
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint32_t m, d;

    __m128i xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: one byte at a time until dst is 16-byte aligned. */
	while (w && ((unsigned long)dst & 15))
	{
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x64_32 (
		_mm_adds_pu16 (
		    pix_multiply_1x64 (
			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
		    unpack_32_1x64 (d)));
	    w--;
	}

	/* Main loop: 16 bytes per iteration with saturating adds. */
	while (w >= 16)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
				&xmm_mask_lo, &xmm_mask_hi,
				&xmm_mask_lo, &xmm_mask_hi);

	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

	    mask += 16;
	    dst += 16;
	    w -= 16;
	}

	/* Scalar tail, same computation as the head loop. */
	while (w)
	{
	    m = (uint32_t) *mask++;
	    d = (uint32_t) *dst;

	    *dst++ = (uint8_t) pack_1x64_32 (
		_mm_adds_pu16 (
		    pix_multiply_1x64 (
			_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)),
		    unpack_32_1x64 (d)));

	    w--;
	}
    }

    _mm_empty ();
}
/* -------------------------------------------------------------------------
 * composite_add_n_8
 */
static void
sse2_composite_add_n_8 (pixman_implementation_t *imp,
                        pixman_op_t op,
                        pixman_image_t * src_image,
                        pixman_image_t * mask_image,
                        pixman_image_t * dst_image,
                        int32_t src_x,
                        int32_t src_y,
                        int32_t mask_x,
                        int32_t mask_y,
                        int32_t dest_x,
                        int32_t dest_y,
                        int32_t width,
                        int32_t height)
{
    /* ADD of a solid source's alpha onto an a8 destination:
     * dst = saturate (dst + src_alpha). */
    uint8_t *dst_line, *dst;
    int dst_stride;
    int32_t w;
    uint32_t src;

    __m128i xmm_src;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    src >>= 24;

    /* Adding zero is a no-op. */
    if (src == 0x00)
	return;

    /* Adding 0xff saturates every pixel to 0xff: plain fill. */
    if (src == 0xff)
    {
	pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride,
	             8, dest_x, dest_y, width, height, 0xff);

	return;
    }

    /* Replicate the alpha byte into all four bytes, then into xmm_src. */
    src = (src << 24) | (src << 16) | (src << 8) | src;
    xmm_src = _mm_set_epi32 (src, src, src, src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	/* Head: one byte at a time (via MMX saturating add) until dst
	 * is 16-byte aligned. */
	while (w && ((unsigned long)dst & 15))
	{
	    *dst = (uint8_t)_mm_cvtsi64_si32 (
		_mm_adds_pu8 (
		    _mm_movepi64_pi64 (xmm_src),
		    _mm_cvtsi32_si64 (*dst)));

	    w--;
	    dst++;
	}

	/* Main loop: 16 bytes per iteration with saturating adds. */
	while (w >= 16)
	{
	    save_128_aligned (
		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));

	    dst += 16;
	    w -= 16;
	}

	/* Scalar tail, same path as the head loop. */
	while (w)
	{
	    *dst = (uint8_t)_mm_cvtsi64_si32 (
		_mm_adds_pu8 (
		    _mm_movepi64_pi64 (xmm_src),
		    _mm_cvtsi32_si64 (*dst)));

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}
/* ----------------------------------------------------------------------
* composite_add_8_8
*/
static void
sse2_composite_add_8_8 (pixman_implementation_t *imp,
                        pixman_op_t op,
                        pixman_image_t * src_image,
                        pixman_image_t * mask_image,
                        pixman_image_t * dst_image,
                        int32_t src_x,
                        int32_t src_y,
                        int32_t mask_x,
                        int32_t mask_y,
                        int32_t dest_x,
                        int32_t dest_y,
                        int32_t width,
                        int32_t height)
{
    /* ADD for two a8 surfaces: dst = saturate (dst + src).
     *
     * Fix: the tail-pointer advance used 'w & 0xfffc', which drops the
     * high bits of the 32-bit width and advances the pointers by the
     * wrong amount for widths >= 65536.  The combiner consumes
     * (w >> 2) << 2 == (w & ~3) bytes, so mask with ~3 instead. */
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint16_t t;

    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	src = src_line;

	dst_line += dst_stride;
	src_line += src_stride;
	w = width;

	/* Small head: scalar saturating adds until dst is 4-byte
	 * aligned.  't | (0 - (t >> 8))' clamps the 9-bit sum to 0xff. */
	while (w && (unsigned long)dst & 3)
	{
	    t = (*dst) + (*src++);
	    *dst++ = t | (0 - (t >> 8));
	    w--;
	}

	/* Bulk of the scanline, 4 bytes per combiner element. */
	core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

	/* Small tail: skip the bytes the combiner handled. */
	dst += w & ~3;
	src += w & ~3;
	w &= 3;

	while (w)
	{
	    t = (*dst) + (*src++);
	    *dst++ = t | (0 - (t >> 8));
	    w--;
	}
    }

    _mm_empty ();
}
/* ---------------------------------------------------------------------
* composite_add_8888_8888
*/
/* PIXMAN_OP_ADD of an 8888 source onto an 8888 destination: each row
 * is handed whole to the generic SSE2 add combiner. */
static void
sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              pixman_image_t *         src_image,
                              pixman_image_t *         mask_image,
                              pixman_image_t *         dst_image,
                              int32_t                  src_x,
                              int32_t                  src_y,
                              int32_t                  mask_x,
                              int32_t                  mask_y,
                              int32_t                  dest_x,
                              int32_t                  dest_y,
                              int32_t                  width,
                              int32_t                  height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    for (; height > 0; height--)
    {
	src = src_line;
	dst = dst_line;

	core_combine_add_u_sse2 (dst, src, NULL, width);

	src_line += src_stride;
	dst_line += dst_stride;
    }

    _mm_empty ();
}
/* -------------------------------------------------------------------------------------------------
* sse2_composite_copy_area
*/
/* Copy a width x height rectangle between two images of identical bpp
 * (16 or 32 only).  Returns FALSE when the bpp combination is not
 * supported so callers can fall back to another implementation.
 * Strides arrive in uint32_t units and are converted to bytes before
 * the copy loop. */
static pixman_bool_t
pixman_blt_sse2 (uint32_t *src_bits,
                 uint32_t *dst_bits,
                 int       src_stride,
                 int       dst_stride,
                 int       src_bpp,
                 int       dst_bpp,
                 int       src_x,
                 int       src_y,
                 int       dst_x,
                 int       dst_y,
                 int       width,
                 int       height)
{
    uint8_t * src_bytes;
    uint8_t * dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
	return FALSE;

    if (src_bpp == 16)
    {
	/* Convert the stride to uint16_t units to find the first pixel,
	 * then to bytes for the copy loop below. */
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
	/* Same as above but with uint32_t pixels. */
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }
    else
    {
	return FALSE;
    }

    while (height--)
    {
	int w;
	uint8_t *s = src_bytes;
	uint8_t *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	/* Align the destination to a 4-byte boundary... */
	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	/* ...then to a 16-byte boundary so SSE2 stores are aligned.
	 * Only the destination is aligned; source loads stay unaligned. */
	while (w >= 4 && ((unsigned long)d & 15))
	{
	    *(uint32_t *)d = *(uint32_t *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	/* Main loop: 64 bytes per iteration. */
	while (w >= 64)
	{
	    __m128i xmm0, xmm1, xmm2, xmm3;

	    xmm0 = load_128_unaligned ((__m128i*)(s));
	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));

	    save_128_aligned ((__m128i*)(d),      xmm0);
	    save_128_aligned ((__m128i*)(d + 16), xmm1);
	    save_128_aligned ((__m128i*)(d + 32), xmm2);
	    save_128_aligned ((__m128i*)(d + 48), xmm3);

	    s += 64;
	    d += 64;
	    w -= 64;
	}

	/* 16-byte, 4-byte and 2-byte tails, in decreasing granularity. */
	while (w >= 16)
	{
	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );

	    w -= 16;
	    d += 16;
	    s += 16;
	}

	while (w >= 4)
	{
	    *(uint32_t *)d = *(uint32_t *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	if (w >= 2)
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    _mm_empty ();

    return TRUE;
}
/* Straight copy (SRC with no mask) between same-bpp images, delegated
 * to pixman_blt_sse2. */
static void
sse2_composite_copy_area (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          pixman_image_t *         src_image,
                          pixman_image_t *         mask_image,
                          pixman_image_t *         dst_image,
                          int32_t                  src_x,
                          int32_t                  src_y,
                          int32_t                  mask_x,
                          int32_t                  mask_y,
                          int32_t                  dest_x,
                          int32_t                  dest_y,
                          int32_t                  width,
                          int32_t                  height)
{
    int src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
    int dst_bpp = PIXMAN_FORMAT_BPP (dst_image->bits.format);

    pixman_blt_sse2 (src_image->bits.bits,
                     dst_image->bits.bits,
                     src_image->bits.rowstride,
                     dst_image->bits.rowstride,
                     src_bpp,
                     dst_bpp,
                     src_x, src_y, dest_x, dest_y, width, height);
}
/* PIXMAN_OP_OVER of an x888 source through an a8 mask onto an 8888
 * destination.  The source's unused alpha byte is forced to 0xff, so
 * every source pixel is treated as opaque and the operation reduces to
 * "in (mask), then over (dest)". */
static void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t *src, *src_line, s;
    uint32_t *dst, *dst_line, d;
    uint8_t *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;
    __m64 ms;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	/* Head: single pixels via MMX until dst is 16-byte aligned. */
	while (w && (unsigned long)dst & 15)
	{
	    s = 0xff000000 | *src++;    /* force the source opaque */
	    m = (uint32_t) *mask++;
	    d = *dst;
	    ms = unpack_32_1x64 (s);

	    if (m != 0xff)
	    {
		/* Partial coverage: in_over with the expanded mask. */
		__m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
		__m64 md = unpack_32_1x64 (d);

		ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
	    }

	    *dst++ = pack_1x64_32 (ms);
	    w--;
	}

	/* Main loop: four pixels (and four mask bytes) per iteration. */
	while (w >= 4)
	{
	    m = *(uint32_t*) mask;
	    xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);

	    if (m == 0xffffffff)
	    {
		/* Fully opaque mask: the (forced-opaque) source wins. */
		save_128_aligned ((__m128i*)dst, xmm_src);
	    }
	    else
	    {
		xmm_dst = load_128_aligned ((__m128i*)dst);

		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	    }

	    src += 4;
	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	/* Tail: remaining <4 pixels; fully transparent mask bytes are
	 * skipped entirely. */
	while (w)
	{
	    m = (uint32_t) *mask++;

	    if (m)
	    {
		s = 0xff000000 | *src;

		if (m == 0xff)
		{
		    *dst = s;
		}
		else
		{
		    __m64 ma, md, ms;

		    d = *dst;

		    ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
		    md = unpack_32_1x64 (d);
		    ms = unpack_32_1x64 (s);

		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
		}

	    }

	    src++;
	    dst++;
	    w--;
	}
    }

    _mm_empty ();
}
/* PIXMAN_OP_OVER of an a8r8g8b8 source through an a8 mask onto an
 * 8888 destination: dst = (src IN mask) OVER dst.  Head and tail loops
 * process single pixels with MMX; the main loop does four pixels per
 * iteration with SSE2. */
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
                                 pixman_image_t *         src_image,
                                 pixman_image_t *         mask_image,
                                 pixman_image_t *         dst_image,
                                 int32_t                  src_x,
                                 int32_t                  src_y,
                                 int32_t                  mask_x,
                                 int32_t                  mask_y,
                                 int32_t                  dest_x,
                                 int32_t                  dest_y,
                                 int32_t                  width,
                                 int32_t                  height)
{
    uint32_t *src, *src_line, s;
    uint32_t *dst, *dst_line, d;
    uint8_t *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	/* Head: single pixels until dst is 16-byte aligned. */
	while (w && (unsigned long)dst & 15)
	{
	    uint32_t sa;

	    s = *src++;
	    m = (uint32_t) *mask++;
	    d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    /* Opaque source, full mask coverage: copy through. */
		    *dst = s;
		}
		else
		{
		    __m64 ms, md, ma, msa;

		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
		    ms = unpack_32_1x64 (s);
		    md = unpack_32_1x64 (d);

		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));

		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
	    w--;
	}

	/* Main loop: four pixels (four mask bytes) per iteration. */
	while (w >= 4)
	{
	    m = *(uint32_t *) mask;

	    if (m)
	    {
		xmm_src = load_128_unaligned ((__m128i*)src);

		if (m == 0xffffffff && is_opaque (xmm_src))
		{
		    /* All four pixels opaque under a full mask: store. */
		    save_128_aligned ((__m128i *)dst, xmm_src);
		}
		else
		{
		    xmm_dst = load_128_aligned ((__m128i *)dst);

		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
		}
	    }

	    src += 4;
	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	/* Tail: remaining <4 pixels, same logic as the head. */
	while (w)
	{
	    uint32_t sa;

	    s = *src++;
	    m = (uint32_t) *mask++;
	    d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    *dst = s;
		}
		else
		{
		    __m64 ms, md, ma, msa;

		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
		    ms = unpack_32_1x64 (s);
		    md = unpack_32_1x64 (d);

		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));

		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
	    w--;
	}
    }

    _mm_empty ();
}
/* PIXMAN_OP_OVER_REVERSE with a solid source: the existing destination
 * is composited OVER the solid color, i.e.
 * dst = dst + (1 - dst.alpha) * src. */
static void
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, *dst;
    __m128i xmm_src;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_dsta_hi, xmm_dsta_lo;
    int dst_stride;
    int32_t w;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* A fully transparent source leaves the destination unchanged. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
	dst = dst_line;

	dst_line += dst_stride;
	w = width;

	/* Head: single pixels (MMX) until dst is 16-byte aligned.
	 * Note the operand order: dst OVER src. */
	while (w && (unsigned long)dst & 15)
	{
	    __m64 vd;

	    vd = unpack_32_1x64 (*dst);

	    *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
	                                    _mm_movepi64_pi64 (xmm_src)));
	    w--;
	    dst++;
	}

	/* Main loop: four pixels per iteration.  over_2x128 writes its
	 * result into its last argument, so the solid source is copied
	 * into temporaries each time. */
	while (w >= 4)
	{
	    __m128i tmp_lo, tmp_hi;

	    xmm_dst = load_128_aligned ((__m128i*)dst);

	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);

	    tmp_lo = xmm_src;
	    tmp_hi = xmm_src;

	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
			&xmm_dsta_lo, &xmm_dsta_hi,
			&tmp_lo, &tmp_hi);

	    save_128_aligned (
		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));

	    w -= 4;
	    dst += 4;
	}

	/* Tail: remaining <4 pixels. */
	while (w)
	{
	    __m64 vd;

	    vd = unpack_32_1x64 (*dst);

	    *dst = pack_1x64_32 (over_1x64 (vd, expand_alpha_1x64 (vd),
	                                    _mm_movepi64_pi64 (xmm_src)));
	    w--;
	    dst++;
	}

    }

    _mm_empty ();
}
/* PIXMAN_OP_OVER of an a8r8g8b8 source with an a8r8g8b8 mask (only the
 * mask's alpha channel is used) onto an 8888 destination:
 * dst = (src IN mask.alpha) OVER dst. */
static void
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
                                    pixman_op_t              op,
                                    pixman_image_t *         src_image,
                                    pixman_image_t *         mask_image,
                                    pixman_image_t *         dst_image,
                                    int32_t                  src_x,
                                    int32_t                  src_y,
                                    int32_t                  mask_x,
                                    int32_t                  mask_y,
                                    int32_t                  dest_x,
                                    int32_t                  dest_y,
                                    int32_t                  width,
                                    int32_t                  height)
{
    uint32_t *src, *src_line, s;
    uint32_t *dst, *dst_line, d;
    uint32_t *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	/* Head: single pixels until dst is 16-byte aligned; only the
	 * alpha byte of each mask pixel is used. */
	while (w && (unsigned long)dst & 15)
	{
	    uint32_t sa;

	    s = *src++;
	    m = (*mask++) >> 24;
	    d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    /* Opaque source, full coverage: straight copy. */
		    *dst = s;
		}
		else
		{
		    __m64 ms, md, ma, msa;

		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
		    ms = unpack_32_1x64 (s);
		    md = unpack_32_1x64 (d);

		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));

		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
	    w--;
	}

	/* Main loop: four pixels per iteration; cheap whole-vector
	 * transparent/opaque tests short-circuit the full math. */
	while (w >= 4)
	{
	    xmm_mask = load_128_unaligned ((__m128i*)mask);

	    if (!is_transparent (xmm_mask))
	    {
		xmm_src = load_128_unaligned ((__m128i*)src);

		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
		{
		    save_128_aligned ((__m128i *)dst, xmm_src);
		}
		else
		{
		    xmm_dst = load_128_aligned ((__m128i *)dst);

		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
		}
	    }

	    src += 4;
	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	/* Tail: remaining <4 pixels, same logic as the head. */
	while (w)
	{
	    uint32_t sa;

	    s = *src++;
	    m = (*mask++) >> 24;
	    d = *dst;

	    sa = s >> 24;

	    if (m)
	    {
		if (sa == 0xff && m == 0xff)
		{
		    *dst = s;
		}
		else
		{
		    __m64 ms, md, ma, msa;

		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
		    ms = unpack_32_1x64 (s);
		    md = unpack_32_1x64 (d);

		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));

		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
		}
	    }

	    dst++;
	    w--;
	}
    }

    _mm_empty ();
}
/* A variant of 'core_combine_over_u_sse2' with minor tweaks */
static force_inline void
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
const uint32_t* ps,
int32_t w,
pixman_fixed_t vx,
pixman_fixed_t unit_x,
pixman_fixed_t max_vx)
{
uint32_t s, d;
const uint32_t* pm = NULL;
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
/* Align dst on a 16-byte boundary */
while (w && ((unsigned long)pd & 15))
{
d = *pd;
s = combine1 (ps + (vx >> 16), pm);
vx += unit_x;
*pd++ = core_combine_over_u_pixel_sse2 (s, d);
if (pm)
pm++;
w--;
}
while (w >= 4)
{
__m128i tmp;
uint32_t tmp1, tmp2, tmp3, tmp4;
tmp1 = ps[vx >> 16];
vx += unit_x;
tmp2 = ps[vx >> 16];
vx += unit_x;
tmp3 = ps[vx >> 16];
vx += unit_x;
tmp4 = ps[vx >> 16];
vx += unit_x;
tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
if (is_opaque (xmm_src_hi))
{
save_128_aligned ((__m128i*)pd, xmm_src_hi);
}
else if (!is_zero (xmm_src_hi))
{
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
expand_alpha_2x128 (
xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
over_2x128 (&xmm_src_lo, &xmm_src_hi,
&xmm_alpha_lo, &xmm_alpha_hi,
&xmm_dst_lo, &xmm_dst_hi);
/* rebuid the 4 pixel data and save*/
save_128_aligned ((__m128i*)pd,
pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
}
w -= 4;
pd += 4;
if (pm)
pm += 4;
}
while (w)
{
d = *pd;
s = combine1 (ps + (vx >> 16), pm);
vx += unit_x;
*pd++ = core_combine_over_u_pixel_sse2 (s, d);
if (pm)
pm++;
w--;
}
_mm_empty ();
}
/* Instantiate the nearest-neighbour scaling main loops around the
 * scanline routine above, one per repeat mode (COVER, NONE, PAD). */
FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, COVER);
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, NONE);
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
		       uint32_t, uint32_t, PAD);
/* Table of operation/format combinations served by the SSE2 fast
 * paths above.  The generic compositing code scans it in order and
 * uses the first matching entry; the PIXMAN_OP_NONE sentinel ends it. */
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),

    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),

    /* Nearest-neighbour scaled OVER paths, per repeat mode. */
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    { PIXMAN_OP_NONE },
};
/* Implementation entry point for blits: try the SSE2 copy first and
 * fall back to the delegate when the bpp combination is unsupported. */
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,
          int                      src_stride,
          int                      dst_stride,
          int                      src_bpp,
          int                      dst_bpp,
          int                      src_x,
          int                      src_y,
          int                      dst_x,
          int                      dst_y,
          int                      width,
          int                      height)
{
    if (pixman_blt_sse2 (src_bits, dst_bits, src_stride, dst_stride,
                         src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y,
                         width, height))
    {
	return TRUE;
    }

    return _pixman_implementation_blt (
	imp->delegate,
	src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
	src_x, src_y, dst_x, dst_y, width, height);
}
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
/* Implementation entry point for fills: prefer the SSE2 fill and
 * delegate when pixman_fill_sse2 reports the bpp as unsupported. */
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
           int                      stride,
           int                      bpp,
           int                      x,
           int                      y,
           int                      width,
           int                      height,
           uint32_t                 xor)
{
    if (pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
	return TRUE;

    return _pixman_implementation_fill (
	imp->delegate, bits, stride, bpp, x, y, width, height, xor);
}
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
/* Create the SSE2 implementation, layered on top of the MMX (or plain
 * fast-path) implementation as its delegate.  Also initializes the
 * global SSE2 and MMX constants used by the routines in this file and
 * installs the SSE2 combiners and blt/fill entry points. */
pixman_implementation_t *
_pixman_implementation_create_sse2 (void)
{
#ifdef USE_MMX
    pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
#else
    pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
#endif
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

    /* SSE2 constants */
    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);

    /* MMX constants */
    mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
    mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);

    mask_x0080 = create_mask_16_64 (0x0080);
    mask_x00ff = create_mask_16_64 (0x00ff);
    mask_x0101 = create_mask_16_64 (0x0101);
    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);

    /* Leave the FPU in a clean state after the MMX constant setup. */
    _mm_empty ();

    /* Set up function pointers */

    /* SSE code patch for fbcompose.c */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    /* Component-alpha combiners. */
    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    return imp;
}
#endif /* USE_SSE2 */