| /* |
| * Copyright © 2008 Rodrigo Kumpera |
| * Copyright © 2008 André Tupinambá |
| * |
| * Permission to use, copy, modify, distribute, and sell this software and its |
| * documentation for any purpose is hereby granted without fee, provided that |
| * the above copyright notice appear in all copies and that both that |
| * copyright notice and this permission notice appear in supporting |
| * documentation, and that the name of Red Hat not be used in advertising or |
| * publicity pertaining to distribution of the software without specific, |
| * written prior permission. Red Hat makes no representations about the |
| * suitability of this software for any purpose. It is provided "as is" |
| * without express or implied warranty. |
| * |
| * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS |
| * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY |
| * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN |
| * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
| * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS |
| * SOFTWARE. |
| * |
| * Author: Rodrigo Kumpera (kumpera@gmail.com) |
| * André Tupinambá (andrelrt@gmail.com) |
| * |
| * Based on work by Owen Taylor and Søren Sandmann |
| */ |
| #ifdef HAVE_CONFIG_H |
| #include <config.h> |
| #endif |
| |
| #include <mmintrin.h> |
| #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ |
| #include <emmintrin.h> /* for SSE2 intrinsics */ |
| |
| #include "pixman-sse.h" |
| |
| #ifdef USE_SSE2 |
| |
| #ifdef _MSC_VER |
| #undef inline |
| #define inline __forceinline |
| #endif |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * Locals |
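 *
 * Mask constants used by the routines below; they are filled in at
 * setup time (see the createMask helpers near fbComposeSetupSSE).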
| */ |
| |
| static __m64 xMask0080; |
| static __m64 xMask00ff; |
| static __m64 xMask0101; |
| static __m64 xMaskAlpha; |
| |
| static __m64 xMask565rgb; |
| static __m64 xMask565Unpack; |
| |
| static __m128i Mask0080; |
| static __m128i Mask00ff; |
| static __m128i Mask0101; |
| static __m128i Maskffff; |
| static __m128i Maskff000000; |
| static __m128i MaskAlpha; |
| |
| static __m128i Mask565r; |
| static __m128i Mask565g1, Mask565g2; |
| static __m128i Mask565b; |
| static __m128i MaskRed; |
| static __m128i MaskGreen; |
| static __m128i MaskBlue; |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * SSE2 Inlines |
| */ |
| static inline __m128i |
| unpack_32_1x128 (uint32_t data) |
| { |
| return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128()); |
| } |
| |
| static inline void |
| unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi) |
| { |
| *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); |
| *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); |
| } |
| |
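/* Unpack eight r5g6b5 pixels into four registers of unpacked channel data;
 * each 5/6-bit field lands in the top bits of its 8-bit slot (the low
 * replication bits are left zero).
 */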
| static inline void |
| unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3) |
| { |
| __m128i lo, hi; |
| __m128i r, g, b; |
| |
| lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); |
| hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); |
| |
| r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed); |
| g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen); |
| b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue); |
| |
| lo = _mm_or_si128 (_mm_or_si128 (r, g), b); |
| |
| r = _mm_and_si128 (_mm_slli_epi32 (hi, 8), MaskRed); |
| g = _mm_and_si128 (_mm_slli_epi32 (hi, 5), MaskGreen); |
| b = _mm_and_si128 (_mm_slli_epi32 (hi, 3), MaskBlue); |
| |
| hi = _mm_or_si128 (_mm_or_si128 (r, g), b); |
| |
| unpack_128_2x128 (lo, data0, data1); |
| unpack_128_2x128 (hi, data2, data3); |
| } |
| |
| static inline uint16_t |
| pack565_32_16 (uint32_t pixel) |
| { |
| return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f)); |
| } |
| |
| static inline __m128i |
| pack_2x128_128 (__m128i lo, __m128i hi) |
| { |
| return _mm_packus_epi16 (lo, hi); |
| } |
| |
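/* Repack two registers of unpacked pixels into 565 form: pack back to four
 * 8888 pixels first, then shift/mask so each 16-bit lane holds one byte of
 * the final r5g6b5 value; pack565_4x128_128 then packs those lanes down.
 */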
| static inline __m128i |
| pack565_2x128_128 (__m128i lo, __m128i hi) |
| { |
| __m128i data; |
| __m128i r, g1, g2, b; |
| |
| data = pack_2x128_128 ( lo, hi ); |
| |
| r = _mm_and_si128 (data , Mask565r); |
| g1 = _mm_and_si128 (_mm_slli_epi32 (data , 3), Mask565g1); |
| g2 = _mm_and_si128 (_mm_srli_epi32 (data , 5), Mask565g2); |
| b = _mm_and_si128 (_mm_srli_epi32 (data , 3), Mask565b); |
| |
| return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); |
| } |
| |
| static inline __m128i |
| pack565_4x128_128 (__m128i xmm0, __m128i xmm1, __m128i xmm2, __m128i xmm3) |
| { |
| __m128i lo, hi; |
| |
| lo = _mm_packus_epi16 (pack565_2x128_128 ( xmm0, xmm1 ), _mm_setzero_si128 ()); |
| hi = _mm_packus_epi16 (_mm_setzero_si128 (), pack565_2x128_128 ( xmm2, xmm3 )); |
| |
| return _mm_or_si128 (lo, hi); |
| } |
| |
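/* Gather the alpha bytes of four packed pixels into a single uint32_t:
 * srli by 24 isolates each alpha, then two packs narrow 32 -> 16 -> 8 bits.
 */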
| static inline uint32_t |
| packAlpha (__m128i x) |
| { |
| return _mm_cvtsi128_si32 (_mm_packus_epi16 (_mm_packus_epi16 (_mm_srli_epi32 (x, 24), |
| _mm_setzero_si128 ()), |
| _mm_setzero_si128 ())); |
| } |
| |
| static inline __m128i |
| expandPixel_32_1x128 (uint32_t data) |
| { |
| return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0)); |
| } |
| |
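/* The expandAlpha helpers broadcast the alpha lane (lane 3) of each
 * unpacked pixel to all four channel lanes; the Rev variants broadcast
 * lane 0 instead.
 */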
| static inline __m128i |
| expandAlpha_1x128 (__m128i data) |
| { |
| return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); |
| } |
| |
| static inline void |
| expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi) |
| { |
| __m128i lo, hi; |
| |
| lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3)); |
| hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3)); |
| *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3)); |
| *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3)); |
| } |
| |
| static inline void |
| expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi) |
| { |
| __m128i lo, hi; |
| |
| lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0)); |
| hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0)); |
| *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0)); |
| *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0)); |
| } |
| |
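/* Per-channel multiply of unpacked pixels with exact division by 255:
 * with t = a*b + 0x80, mulhi (t, 0x0101) == (t * 257) >> 16, which is the
 * classic rounded a*b/255 for 8-bit channels.
 */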
| static inline void |
| pixMultiply_2x128 (__m128i dataLo, __m128i dataHi, __m128i alphaLo, __m128i alphaHi, __m128i* retLo, __m128i* retHi) |
| { |
| __m128i lo, hi; |
| |
| lo = _mm_mullo_epi16 (dataLo, alphaLo); |
| hi = _mm_mullo_epi16 (dataHi, alphaHi); |
| lo = _mm_adds_epu16 (lo, Mask0080); |
| hi = _mm_adds_epu16 (hi, Mask0080); |
| *retLo = _mm_mulhi_epu16 (lo, Mask0101); |
| *retHi = _mm_mulhi_epu16 (hi, Mask0101); |
| } |
| |
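/* Compute (src * alphaDst + dst * alphaSrc) / 255 per channel in one pass,
 * reusing the 0x80/0x0101 rounding trick above; the saturating adds clamp
 * any out-of-range intermediate instead of letting it wrap.
 */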
| static inline void |
| pixAddMultiply_2x128 (__m128i srcLo, __m128i srcHi, __m128i alphaDstLo, __m128i alphaDstHi, |
| __m128i dstLo, __m128i dstHi, __m128i alphaSrcLo, __m128i alphaSrcHi, |
| __m128i* retLo, __m128i* retHi) |
| { |
| __m128i lo, hi; |
| __m128i mulLo, mulHi; |
| |
| lo = _mm_mullo_epi16 (srcLo, alphaDstLo); |
| hi = _mm_mullo_epi16 (srcHi, alphaDstHi); |
| mulLo = _mm_mullo_epi16 (dstLo, alphaSrcLo); |
| mulHi = _mm_mullo_epi16 (dstHi, alphaSrcHi); |
| lo = _mm_adds_epu16 (lo, Mask0080); |
| hi = _mm_adds_epu16 (hi, Mask0080); |
| lo = _mm_adds_epu16 (lo, mulLo); |
| hi = _mm_adds_epu16 (hi, mulHi); |
| *retLo = _mm_mulhi_epu16 (lo, Mask0101); |
| *retHi = _mm_mulhi_epu16 (hi, Mask0101); |
| } |
| |
| static inline void |
| negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi) |
| { |
| *negLo = _mm_xor_si128 (dataLo, Mask00ff); |
| *negHi = _mm_xor_si128 (dataHi, Mask00ff); |
| } |
| |
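/* Swap the R and B lanes of each unpacked pixel, leaving G and A in place
 * (used by the over-reverse-non-premultiplied path below).
 */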
| static inline void |
| invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi) |
| { |
| __m128i lo, hi; |
| |
| lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2)); |
| hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2)); |
| *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2)); |
| *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2)); |
| } |
| |
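/* Porter-Duff OVER on unpacked data: dst = src + dst * (255 - alpha) / 255,
 * with a saturating byte add for the final sum.
 */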
| static inline void |
| over_2x128 (__m128i srcLo, __m128i srcHi, __m128i alphaLo, __m128i alphaHi, __m128i* dstLo, __m128i* dstHi) |
| { |
| negate_2x128 (alphaLo, alphaHi, &alphaLo, &alphaHi); |
| |
| pixMultiply_2x128 (*dstLo, *dstHi, alphaLo, alphaHi, dstLo, dstHi); |
| |
| *dstLo = _mm_adds_epu8 (srcLo, *dstLo); |
| *dstHi = _mm_adds_epu8 (srcHi, *dstHi); |
| } |
| |
| static inline void |
| overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi) |
| { |
| __m128i lo, hi; |
| __m128i alphaLo, alphaHi; |
| |
| expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi); |
| |
| lo = _mm_or_si128 (alphaLo, MaskAlpha); |
| hi = _mm_or_si128 (alphaHi, MaskAlpha); |
| |
| invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi); |
| |
| pixMultiply_2x128 (srcLo, srcHi, lo, hi, &lo, &hi); |
| |
| over_2x128 (lo, hi, alphaLo, alphaHi, dstLo, dstHi); |
| } |
| |
| static inline void |
| inOver_2x128 (__m128i srcLo, __m128i srcHi, __m128i alphaLo, __m128i alphaHi, |
| __m128i maskLo, __m128i maskHi, __m128i* dstLo, __m128i* dstHi) |
| { |
| __m128i sLo, sHi; |
| __m128i aLo, aHi; |
| |
| pixMultiply_2x128 ( srcLo, srcHi, maskLo, maskHi, &sLo, &sHi); |
| pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi); |
| |
| over_2x128 (sLo, sHi, aLo, aHi, dstLo, dstHi); |
| } |
| |
| static inline void |
| cachePrefetch (__m128i* addr) |
| { |
    _mm_prefetch ((const char*) addr, _MM_HINT_T0);
| } |
| |
| static inline void |
| cachePrefetchNext (__m128i* addr) |
| { |
    _mm_prefetch ((const char*) (addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
| } |
| |
/* load 4 pixels from a 16-byte aligned address */
| static inline __m128i |
| load128Aligned (__m128i* src) |
| { |
| return _mm_load_si128 (src); |
| } |
| |
/* load 4 pixels from an unaligned address */
| static inline __m128i |
| load128Unaligned (__m128i* src) |
| { |
| return _mm_loadu_si128 (src); |
| } |
| |
/* save 4 pixels to a 16-byte aligned address with a non-temporal (write-combining) store */
| static inline void |
| save128WriteCombining (__m128i* dst, __m128i data) |
| { |
| _mm_stream_si128 (dst, data); |
| } |
| |
/* save 4 pixels to a 16-byte aligned address */
| static inline void |
| save128Aligned (__m128i* dst, __m128i data) |
| { |
| _mm_store_si128 (dst, data); |
| } |
| |
/* save 4 pixels to an unaligned address */
| static inline void |
| save128Unaligned (__m128i* dst, __m128i data) |
| { |
| _mm_storeu_si128 (dst, data); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * MMX inlines |
| */ |
| |
| static inline __m64 |
| unpack_32_1x64 (uint32_t data) |
| { |
| return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64()); |
| } |
| |
| static inline __m64 |
| expandAlpha_1x64 (__m64 data) |
| { |
| return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3)); |
| } |
| |
| static inline __m64 |
| expandAlphaRev_1x64 (__m64 data) |
| { |
| return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0)); |
| } |
| |
| static inline __m64 |
| expandPixel_8_1x64 (uint8_t data) |
| { |
| return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0)); |
| } |
| |
| static inline __m64 |
| pixMultiply_1x64 (__m64 data, __m64 alpha) |
| { |
| return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha), |
| xMask0080), |
| xMask0101); |
| } |
| |
| static inline __m64 |
| pixAddMultiply_1x64 (__m64 src, __m64 alphaDst, __m64 dst, __m64 alphaSrc) |
| { |
| return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (src, alphaDst), |
| xMask0080), |
| _mm_mullo_pi16 (dst, alphaSrc)), |
| xMask0101); |
| } |
| |
| static inline __m64 |
| negate_1x64 (__m64 data) |
| { |
| return _mm_xor_si64 (data, xMask00ff); |
| } |
| |
| static inline __m64 |
| invertColors_1x64 (__m64 data) |
| { |
| return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2)); |
| } |
| |
| static inline __m64 |
| over_1x64 (__m64 src, __m64 alpha, __m64 dst) |
| { |
| return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha))); |
| } |
| |
| static inline __m64 |
| inOver_1x64 (__m64 src, __m64 alpha, __m64 mask, __m64 dst) |
| { |
| return over_1x64 (pixMultiply_1x64 (src, mask), |
| pixMultiply_1x64 (alpha, mask), |
| dst); |
| } |
| |
| static inline __m64 |
| overRevNonPre_1x64 (__m64 src, __m64 dst) |
| { |
| __m64 alpha = expandAlpha_1x64 (src); |
| |
| return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src), |
| _mm_or_si64 (alpha, xMaskAlpha)), |
| alpha, |
| dst); |
| } |
| |
| static inline uint32_t |
| pack_1x64_32( __m64 data ) |
| { |
| return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64())); |
| } |
| |
/* Expand a 565 pixel in the low 16 bits of an MMX register into
| * |
| * 00RR00GG00BB |
| * |
| * --- Expanding 565 in the low word --- |
| * |
| * m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
| * m = m & (01f0003f001f); |
| * m = m * (008404100840); |
| * m = m >> 8; |
| * |
| * Note the trick here - the top word is shifted by another nibble to |
| * avoid it bumping into the middle word |
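 *
 * Worked example: 0xffff (white) masks to 01f0 003f 001f; the multiply
 * gives ffc0 fff0 ffc0 and the final >> 8 yields 00ff 00ff 00ff.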
| */ |
| static inline __m64 |
| expand565_16_1x64 (uint16_t pixel) |
| { |
| __m64 p; |
| __m64 t1, t2; |
| |
| p = _mm_cvtsi32_si64 ((uint32_t) pixel); |
| |
| t1 = _mm_slli_si64 (p, 36 - 11); |
| t2 = _mm_slli_si64 (p, 16 - 5); |
| |
| p = _mm_or_si64 (t1, p); |
| p = _mm_or_si64 (t2, p); |
| p = _mm_and_si64 (p, xMask565rgb); |
| p = _mm_mullo_pi16 (p, xMask565Unpack); |
| |
| return _mm_srli_pi16 (p, 8); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * Compose Core transformations |
| */ |
| static inline uint32_t |
| coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst) |
| { |
| uint8_t a; |
| __m64 ms; |
| |
| a = src >> 24; |
| |
| if (a == 0xff) |
| { |
| return src; |
| } |
| else if (a) |
| { |
| ms = unpack_32_1x64 (src); |
| return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst))); |
| } |
| |
| return dst; |
| } |
| |
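/* All the scanline combiners below share the same shape: combine single
 * pixels until dst reaches a 16-byte boundary, process 4 pixels per
 * iteration with SSE2, then finish the remainder pixel by pixel.
 */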
| static inline void |
| coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, int w) |
| { |
| uint32_t pa; |
| uint32_t s, d; |
| |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmAlphaLo, xmmAlphaHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| /* Align dst on a 16-byte boundary */ |
| while (w && |
| ((unsigned long)pd & 15)) |
| { |
| d = *pd; |
| s = *ps++; |
| |
| *pd++ = coreCombineOverUPixelsse2 (s, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
        /* src is not guaranteed to be 16-byte aligned, so load it unaligned */
| xmmSrcHi = load128Unaligned ((__m128i*) ps); |
| |
| /* Check the alpha channel */ |
| pa = packAlpha (xmmSrcHi); |
| |
| if (pa == 0xffffffff) |
| { |
| save128Aligned ((__m128i*)pd, xmmSrcHi); |
| } |
| else if (pa) |
| { |
| xmmDstHi = load128Aligned ((__m128i*) pd); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); |
| |
| over_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi); |
| |
            /* rebuild the 4 pixel data and save */
| save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| } |
| |
| w -= 4; |
| ps += 4; |
| pd += 4; |
| } |
| |
| while (w) |
| { |
| d = *pd; |
| s = *ps++; |
| |
| *pd++ = coreCombineOverUPixelsse2 (s, d); |
| w--; |
| } |
| } |
| |
| static inline void |
| coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, int w) |
| { |
| uint32_t s, d; |
| |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmAlphaLo, xmmAlphaHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| /* Align dst on a 16-byte boundary */ |
| while (w && |
| ((unsigned long)pd & 15)) |
| { |
| d = *pd; |
| s = *ps++; |
| |
| *pd++ = coreCombineOverUPixelsse2 (d, s); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
        /* src is not guaranteed to be 16-byte aligned, so load it unaligned */
| xmmSrcHi = load128Unaligned ((__m128i*) ps); |
| xmmDstHi = load128Aligned ((__m128i*) pd); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi); |
| |
| over_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmSrcLo, &xmmSrcHi); |
| |
        /* rebuild the 4 pixel data and save */
| save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi)); |
| |
| w -= 4; |
| ps += 4; |
| pd += 4; |
| } |
| |
| while (w) |
| { |
| d = *pd; |
| s = *ps++; |
| |
| *pd++ = coreCombineOverUPixelsse2 (d, s); |
| w--; |
| } |
| } |
| |
| static inline uint32_t |
| coreCombineInUPixelsse2 (uint32_t src, uint32_t dst) |
| { |
| uint32_t maska = src >> 24; |
| |
| if (maska == 0) |
| { |
| return 0; |
| } |
| else if (maska != 0xff) |
| { |
| return pack_1x64_32(pixMultiply_1x64 (unpack_32_1x64 (dst), expandAlpha_1x64 (unpack_32_1x64 (src)))); |
| } |
| |
| return dst; |
| } |
| |
| static inline void |
| coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, int w) |
| { |
| uint32_t s, d; |
| |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
    while (w && ((unsigned long) pd & 15))
| { |
| s = *ps++; |
| d = *pd; |
| |
| *pd++ = coreCombineInUPixelsse2 (d, s); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
| xmmDstHi = load128Aligned ((__m128i*) pd); |
| xmmSrcHi = load128Unaligned ((__m128i*) ps); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| d = *pd; |
| |
| *pd++ = coreCombineInUPixelsse2 (d, s); |
| w--; |
| } |
| } |
| |
| static inline void |
| coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, int w) |
| { |
| uint32_t s, d; |
| |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
    while (w && ((unsigned long) pd & 15))
| { |
| s = *ps++; |
| d = *pd; |
| |
| *pd++ = coreCombineInUPixelsse2 (s, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
| xmmDstHi = load128Aligned ((__m128i*) pd); |
| xmmSrcHi = load128Unaligned ((__m128i*) ps); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| d = *pd; |
| |
| *pd++ = coreCombineInUPixelsse2 (s, d); |
| w--; |
| } |
| } |
| |
| static inline void |
| coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, int w) |
| { |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
    while (w && ((unsigned long) pd & 15))
| { |
| uint32_t s = *ps++; |
| uint32_t d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s))))); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
| xmmSrcHi = load128Unaligned ((__m128i*) ps); |
| xmmDstHi = load128Aligned ((__m128i*) pd); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| |
| pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| uint32_t s = *ps++; |
| uint32_t d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s))))); |
| w--; |
| } |
| } |
| |
| static inline void |
| coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, int w) |
| { |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
    while (w && ((unsigned long) pd & 15))
| { |
| uint32_t s = *ps++; |
| uint32_t d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d))))); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
| xmmSrcHi = load128Unaligned ((__m128i*) ps); |
| xmmDstHi = load128Aligned ((__m128i*) pd); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); |
| negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| uint32_t s = *ps++; |
| uint32_t d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d))))); |
| w--; |
| } |
| } |
| |
| static inline uint32_t |
| coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst) |
| { |
| __m64 s = unpack_32_1x64 (src); |
| __m64 d = unpack_32_1x64 (dst); |
| |
| __m64 sa = negate_1x64 (expandAlpha_1x64 (s)); |
| __m64 da = expandAlpha_1x64 (d); |
| |
| return pack_1x64_32 (pixAddMultiply_1x64 (s, da, d, sa)); |
| } |
| |
| static inline void |
| coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w) |
| { |
| uint32_t s, d; |
| |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; |
| __m128i xmmAlphaDstLo, xmmAlphaDstHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
    while (w && ((unsigned long) pd & 15))
| { |
| s = *ps++; |
| d = *pd; |
| |
| *pd++ = coreCombineAtopUPixelsse2 (s, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
| xmmSrcHi = load128Unaligned ((__m128i*) ps); |
| xmmDstHi = load128Aligned ((__m128i*) pd); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); |
| |
| negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); |
| |
| pixAddMultiply_2x128 ( xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi, |
| xmmDstLo, xmmDstHi, xmmAlphaSrcLo, xmmAlphaSrcHi, |
| &xmmDstLo, &xmmDstHi ); |
| |
| save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| d = *pd; |
| |
| *pd++ = coreCombineAtopUPixelsse2 (s, d); |
| w--; |
| } |
| } |
| |
| static inline uint32_t |
| coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst) |
| { |
| __m64 s = unpack_32_1x64 (src); |
| __m64 d = unpack_32_1x64 (dst); |
| |
| __m64 sa = expandAlpha_1x64 (s); |
| __m64 da = negate_1x64 (expandAlpha_1x64 (d)); |
| |
| return pack_1x64_32 (pixAddMultiply_1x64 (s, da, d, sa)); |
| } |
| |
| static inline void |
| coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w) |
| { |
| uint32_t s, d; |
| |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; |
| __m128i xmmAlphaDstLo, xmmAlphaDstHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
    while (w && ((unsigned long) pd & 15))
| { |
| s = *ps++; |
| d = *pd; |
| |
| *pd++ = coreCombineReverseAtopUPixelsse2 (s, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
| xmmSrcHi = load128Unaligned ((__m128i*) ps); |
| xmmDstHi = load128Aligned ((__m128i*) pd); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); |
| |
| negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); |
| |
| pixAddMultiply_2x128 ( xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi, |
| xmmDstLo, xmmDstHi, xmmAlphaSrcLo, xmmAlphaSrcHi, |
| &xmmDstLo, &xmmDstHi ); |
| |
| save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| d = *pd; |
| |
| *pd++ = coreCombineReverseAtopUPixelsse2 (s, d); |
| w--; |
| } |
| } |
| |
| static inline uint32_t |
| coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst) |
| { |
| __m64 s = unpack_32_1x64 (src); |
| __m64 d = unpack_32_1x64 (dst); |
| |
| return pack_1x64_32 (pixAddMultiply_1x64 (s, negate_1x64 (expandAlpha_1x64 (d)), d, negate_1x64 (expandAlpha_1x64 (s)))); |
| } |
| |
| static inline void |
| coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, int width) |
| { |
| int w = width; |
| uint32_t s, d; |
| uint32_t* pd = dst; |
| const uint32_t* ps = src; |
| |
| __m128i xmmSrc, xmmSrcLo, xmmSrcHi; |
| __m128i xmmDst, xmmDstLo, xmmDstHi; |
| __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; |
| __m128i xmmAlphaDstLo, xmmAlphaDstHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
    while (w && ((unsigned long) pd & 15))
| { |
| s = *ps++; |
| d = *pd; |
| |
| *pd++ = coreCombineXorUPixelsse2 (s, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
| xmmSrc = load128Unaligned ((__m128i*) ps); |
| xmmDst = load128Aligned ((__m128i*) pd); |
| |
| unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); |
| |
| negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); |
| negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); |
| |
| pixAddMultiply_2x128 ( xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi, |
| xmmDstLo, xmmDstHi, xmmAlphaSrcLo, xmmAlphaSrcHi, |
| &xmmDstLo, &xmmDstHi ); |
| |
| save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| d = *pd; |
| |
| *pd++ = coreCombineXorUPixelsse2 (s, d); |
| w--; |
| } |
| } |
| |
| static inline void |
| coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, int width) |
| { |
| int w = width; |
    uint32_t s, d;
| uint32_t* pd = dst; |
| const uint32_t* ps = src; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| d = *pd; |
| *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
| save128Aligned( (__m128i*)pd, |
| _mm_adds_epu8( load128Unaligned((__m128i*)ps), |
| load128Aligned ((__m128i*)pd)) ); |
| pd += 4; |
| ps += 4; |
| w -= 4; |
| } |
| |
| while (w--) |
| { |
| s = *ps++; |
| d = *pd; |
| *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); |
| } |
| } |
| |
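/* SATURATE for one pixel: when the src alpha exceeds the remaining dst
 * alpha (~dst >> 24), src is first scaled by da/sa (FbIntDiv is assumed
 * to return that ratio as a 0-255 value) so the saturating add below
 * cannot overshoot.
 */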
| static inline uint32_t |
| coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst) |
| { |
| __m64 ms = unpack_32_1x64 (src); |
| __m64 md = unpack_32_1x64 (dst); |
| uint32_t sa = src >> 24; |
| uint32_t da = ~dst >> 24; |
| |
| if (sa > da) |
| { |
| ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (FbIntDiv(da, sa) << 24))); |
| } |
| |
| return pack_1x64_32 (_mm_adds_pu16 (md, ms)); |
| } |
| |
| static inline void |
| coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, int w) |
| { |
    uint32_t s, d;
| |
| uint32_t packCmp; |
| __m128i xmmSrc, xmmDst; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| d = *pd; |
| *pd++ = coreCombineSaturateUPixelsse2 (s, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| |
| xmmDst = load128Aligned ((__m128i*)pd); |
| xmmSrc = load128Unaligned((__m128i*)ps); |
| |
| packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24), |
| _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24))); |
| |
        /* if any src alpha is greater than the respective ~dst alpha */
| if (packCmp) |
| { |
| s = *ps++; |
| d = *pd; |
| *pd++ = coreCombineSaturateUPixelsse2 (s, d); |
| |
| s = *ps++; |
| d = *pd; |
| *pd++ = coreCombineSaturateUPixelsse2 (s, d); |
| |
| s = *ps++; |
| d = *pd; |
| *pd++ = coreCombineSaturateUPixelsse2 (s, d); |
| |
| s = *ps++; |
| d = *pd; |
| *pd++ = coreCombineSaturateUPixelsse2 (s, d); |
| } |
| else |
| { |
| save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc)); |
| |
| pd += 4; |
| ps += 4; |
| } |
| |
| w -= 4; |
| } |
| |
| while (w--) |
| { |
| s = *ps++; |
| d = *pd; |
| *pd++ = coreCombineSaturateUPixelsse2 (s, d); |
| } |
| } |
| |
| static inline void |
| coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) |
| { |
| uint32_t s, m; |
| |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| __m128i xmmDstLo, xmmDstHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); |
| w--; |
| } |
| } |
| |
| static inline uint32_t |
| coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) |
| { |
| __m64 s = unpack_32_1x64 (src); |
| |
| return pack_1x64_32 (inOver_1x64 (s, expandAlpha_1x64 (s), unpack_32_1x64 (mask), unpack_32_1x64 (dst))); |
| } |
| |
| static inline void |
| coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) |
| { |
| uint32_t s, m, d; |
| |
| __m128i xmmAlphaLo, xmmAlphaHi; |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = coreCombineOverCPixelsse2 (s, m, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmDstHi = load128Aligned ((__m128i*)pd); |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); |
| |
| inOver_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = coreCombineOverCPixelsse2 (s, m, d); |
| w--; |
| } |
| } |
| |
| static inline uint32_t |
| coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) |
| { |
| __m64 d = unpack_32_1x64 (dst); |
| |
| return pack_1x64_32(over_1x64 (d, expandAlpha_1x64 (d), pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask)))); |
| } |
| |
| static inline void |
| coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) |
| { |
| uint32_t s, m, d; |
| |
| __m128i xmmAlphaLo, xmmAlphaHi; |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmDstHi = load128Aligned ((__m128i*)pd); |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi); |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| over_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmMaskLo, &xmmMaskHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi)); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d); |
| w--; |
| } |
| } |
| |
| static inline void |
| coreCombineInCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) |
| { |
| uint32_t s, m, d; |
| |
| __m128i xmmAlphaLo, xmmAlphaHi; |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), |
| expandAlpha_1x64 (unpack_32_1x64 (d)))); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmDstHi = load128Aligned ((__m128i*)pd); |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi); |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); |
| |
| pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), |
| expandAlpha_1x64 (unpack_32_1x64 (d)))); |
| w--; |
| } |
| } |
| |
| static inline void |
| coreCombineInReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) |
| { |
| uint32_t s, m, d; |
| |
| __m128i xmmAlphaLo, xmmAlphaHi; |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), |
| pixMultiply_1x64 (unpack_32_1x64 (m), |
| expandAlpha_1x64 (unpack_32_1x64 (s))))); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmDstHi = load128Aligned ((__m128i*)pd); |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); |
| pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi); |
| |
| pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), |
| pixMultiply_1x64 (unpack_32_1x64 (m), |
| expandAlpha_1x64 (unpack_32_1x64 (s))))); |
| w--; |
| } |
| } |
| |
| static inline void |
| coreCombineOutCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) |
| { |
| uint32_t s, m, d; |
| |
| __m128i xmmAlphaLo, xmmAlphaHi; |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), |
| negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d))))); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmDstHi = load128Aligned ((__m128i*)pd); |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi); |
| negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi); |
| |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); |
| pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), |
| negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d))))); |
| w--; |
| } |
| } |
| |
| static inline void |
| coreCombineOutReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) |
| { |
| uint32_t s, m, d; |
| |
| __m128i xmmAlphaLo, xmmAlphaHi; |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), |
| negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m), |
| expandAlpha_1x64 (unpack_32_1x64 (s)))))); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmDstHi = load128Aligned ((__m128i*)pd); |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); |
| |
| pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaLo, xmmAlphaHi, &xmmMaskLo, &xmmMaskHi); |
| |
| negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), |
| negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m), |
| expandAlpha_1x64 (unpack_32_1x64 (s)))))); |
| w--; |
| } |
| } |
| |
| static inline uint32_t |
| coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) |
| { |
| __m64 m = unpack_32_1x64 (mask); |
| __m64 s = unpack_32_1x64 (src); |
| __m64 d = unpack_32_1x64 (dst); |
| __m64 sa = expandAlpha_1x64 (s); |
| __m64 da = expandAlpha_1x64 (d); |
| |
| s = pixMultiply_1x64 (s, m); |
| m = negate_1x64 (pixMultiply_1x64 (m, sa)); |
| |
| return pack_1x64_32 (pixAddMultiply_1x64 (d, m, s, da)); |
| } |
| |
| static inline void |
| coreCombineAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) |
| { |
| uint32_t s, m, d; |
| |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; |
| __m128i xmmAlphaDstLo, xmmAlphaDstHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = coreCombineAtopCPixelsse2 (s, m, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmDstHi = load128Aligned ((__m128i*)pd); |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); |
| |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi); |
| pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi); |
| |
| negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| pixAddMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi, |
| xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi, |
| &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = coreCombineAtopCPixelsse2 (s, m, d); |
| w--; |
| } |
| } |
| |
| static inline uint32_t |
| coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) |
| { |
| __m64 m = unpack_32_1x64 (mask); |
| __m64 s = unpack_32_1x64 (src); |
| __m64 d = unpack_32_1x64 (dst); |
| |
| __m64 da = negate_1x64 (expandAlpha_1x64 (d)); |
| __m64 sa = expandAlpha_1x64 (s); |
| |
| s = pixMultiply_1x64 (s, m); |
| m = pixMultiply_1x64 (m, sa); |
| |
| return pack_1x64_32 (pixAddMultiply_1x64 (d, m, s, da)); |
| } |
| |
| static inline void |
| coreCombineReverseAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) |
| { |
| uint32_t s, m, d; |
| |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; |
| __m128i xmmAlphaDstLo, xmmAlphaDstHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmDstHi = load128Aligned ((__m128i*)pd); |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); |
| |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi); |
| pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi); |
| |
| negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); |
| |
| pixAddMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi, |
| xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi, |
| &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d); |
| w--; |
| } |
| } |
| |
| static inline uint32_t |
| coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) |
| { |
| __m64 a = unpack_32_1x64 (mask); |
| __m64 s = unpack_32_1x64 (src); |
| __m64 d = unpack_32_1x64 (dst); |
| |
| return pack_1x64_32 (pixAddMultiply_1x64 (d, |
| negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s))), |
| pixMultiply_1x64 (s, a), |
| negate_1x64 (expandAlpha_1x64 (d)))); |
| } |
| |
| static inline void |
| coreCombineXorCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) |
| { |
| uint32_t s, m, d; |
| |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; |
| __m128i xmmAlphaDstLo, xmmAlphaDstHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = coreCombineXorCPixelsse2 (s, m, d); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmDstHi = load128Aligned ((__m128i*)pd); |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); |
| expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); |
| |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi); |
| pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi); |
| |
| negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); |
| negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| pixAddMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi, |
| xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi, |
| &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = coreCombineXorCPixelsse2 (s, m, d); |
| w--; |
| } |
| } |
| |
| static inline void |
| coreCombineAddCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w) |
| { |
| uint32_t s, m, d; |
| |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| __m128i xmmMaskLo, xmmMaskHi; |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s), |
| unpack_32_1x64 (m)), |
| unpack_32_1x64 (d))); |
| w--; |
| } |
| |
    /* call prefetch hint to optimize cache load */
| cachePrefetch ((__m128i*)ps); |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)ps); |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmSrcHi = load128Unaligned ((__m128i*)ps); |
| xmmMaskHi = load128Unaligned ((__m128i*)pm); |
| xmmDstHi = load128Aligned ((__m128i*)pd); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi); |
| |
| save128Aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo), |
| _mm_adds_epu8 (xmmSrcHi, xmmDstHi))); |
| |
| ps += 4; |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| s = *ps++; |
| m = *pm++; |
| d = *pd; |
| |
| *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s), |
| unpack_32_1x64 (m)), |
| unpack_32_1x64 (d))); |
| w--; |
| } |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbComposeSetupSSE |
| */ |
| static inline __m64 |
| createMask_16_64 (uint16_t mask) |
| { |
| return _mm_set1_pi16 (mask); |
| } |
| |
| static inline __m128i |
| createMask_16_128 (uint16_t mask) |
| { |
| return _mm_set1_epi16 (mask); |
| } |
| |
| static inline __m64 |
| createMask_2x32_64 (uint32_t mask0, uint32_t mask1) |
| { |
| return _mm_set_pi32 (mask0, mask1); |
| } |
| |
| static inline __m128i |
| createMask_2x32_128 (uint32_t mask0, uint32_t mask1) |
| { |
| return _mm_set_epi32 (mask0, mask1, mask0, mask1); |
| } |
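| |
| /* In both 2x32 helpers mask0 lands in the high 32 bits of each 64-bit |
| * half and mask1 in the low 32 bits, following the "most-significant |
| * argument first" convention of _mm_set_pi32/_mm_set_epi32. */ |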
| |
| /* SSE2 code patch for fbcompose.c */ |
| |
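| /* Each wrapper below runs its SSE2 core loop and then issues _mm_empty() |
| * to clear the MMX state: the scalar 1x64 head/tail paths use MMX |
| * registers, which alias the x87 floating-point stack. */ |
| |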
| static FASTCALL void |
| sse2CombineMaskU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineReverseInUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineOverU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineOverUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineOverReverseU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineOverReverseUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineInU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineInUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineInReverseU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineReverseInUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineOutU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineOutUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineOutReverseU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineReverseOutUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineAtopU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineAtopUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineAtopReverseU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineReverseAtopUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineXorU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineXorUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineAddU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineAddUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineSaturateU (uint32_t *dst, const uint32_t *src, int width) |
| { |
| coreCombineSaturateUsse2 (dst, src, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineSrcC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineSrcCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineOverC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineOverCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineOverReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineOverReverseCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineInC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineInCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineInReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineInReverseCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineOutC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineOutCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineOutReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineOutReverseCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineAtopC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineAtopCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineAtopReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineReverseAtopCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineXorC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineXorCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
| static FASTCALL void |
| sse2CombineAddC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width) |
| { |
| coreCombineAddCsse2 (dst, src, mask, width); |
| _mm_empty(); |
| } |
| |
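| /* One-time initialization: build the constant masks and, if the CPU |
| * supports SSE2, install the SSE2 combiners into pixman_composeFunctions. */ |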
| void |
| fbComposeSetupSSE(void) |
| { |
| static pixman_bool_t initialized = FALSE; |
| |
| if (initialized) |
| return; |
| |
| /* check if we have SSE2 support and initialize accordingly */ |
| if (pixman_have_sse()) |
| { |
| /* SSE2 constants */ |
| Mask565r = createMask_2x32_128 (0x00f80000, 0x00f80000); |
| Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000); |
| Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0); |
| Mask565b = createMask_2x32_128 (0x0000001f, 0x0000001f); |
| MaskRed = createMask_2x32_128 (0x00f80000, 0x00f80000); |
| MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00); |
| MaskBlue = createMask_2x32_128 (0x000000f8, 0x000000f8); |
| |
| Mask0080 = createMask_16_128 (0x0080); |
| Mask00ff = createMask_16_128 (0x00ff); |
| Mask0101 = createMask_16_128 (0x0101); |
| Maskffff = createMask_16_128 (0xffff); |
| Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000); |
| MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000); |
| |
| /* MMX constants */ |
| xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f); |
| xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840); |
| |
| xMask0080 = createMask_16_64 (0x0080); |
| xMask00ff = createMask_16_64 (0x00ff); |
| xMask0101 = createMask_16_64 (0x0101); |
| xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000); |
| |
| /* SSE2 code patch for fbcompose.c */ |
| pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = sse2CombineOverU; |
| pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU; |
| pixman_composeFunctions.combineU[PIXMAN_OP_IN] = sse2CombineInU; |
| pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU; |
| pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = sse2CombineOutU; |
| |
| pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU; |
| pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = sse2CombineAtopU; |
| pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU; |
| pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = sse2CombineXorU; |
| pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = sse2CombineAddU; |
| |
| pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = sse2CombineSaturateU; |
| |
| pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = sse2CombineSrcC; |
| pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = sse2CombineOverC; |
| pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC; |
| pixman_composeFunctions.combineC[PIXMAN_OP_IN] = sse2CombineInC; |
| pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC; |
| pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = sse2CombineOutC; |
| pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC; |
| pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = sse2CombineAtopC; |
| pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC; |
| pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = sse2CombineXorC; |
| pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = sse2CombineAddC; |
| |
| pixman_composeFunctions.combineMaskU = sse2CombineMaskU; |
| } |
| |
| initialized = TRUE; |
| |
| _mm_empty(); |
| } |
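| |
| /* Usage note: fbComposeSetupSSE() must run once before any compositing; |
| * later calls return immediately via the `initialized` flag. The flag is |
| * a plain static, so the first call should not race with other threads. */ |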
| |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSolid_nx8888 |
| */ |
| |
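| /* Solid OVER onto a8r8g8b8: dst = src + (1 - srcAlpha) * dst. Like the |
| * other fast paths below, the scanline loop has three phases: a scalar |
| * head until dst is 16-byte aligned, a SIMD body (4 pixels per iteration |
| * here, 8 for the 565 paths), and a scalar tail for the remainder. */ |
| |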
| void |
| fbCompositeSolid_nx8888sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t src; |
| uint32_t *dstLine, *dst, d; |
| uint16_t w; |
| int dstStride; |
| __m128i xmmSrc, xmmAlpha; |
| __m128i xmmDst, xmmDstLo, xmmDstHi; |
| |
| fbComposeGetSolid(pSrc, src, pDst->bits.format); |
| |
| if (src >> 24 == 0) |
| return; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); |
| |
| xmmSrc = expandPixel_32_1x128 (src); |
| xmmAlpha = expandAlpha_1x128 (xmmSrc); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)dst); |
| |
| dstLine += dstStride; |
| w = width; |
| |
| while (w && (unsigned long)dst & 15) |
| { |
| d = *dst; |
| *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| unpack_32_1x64 (d))); |
| w--; |
| } |
| |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)dst); |
| |
| xmmDst = load128Aligned ((__m128i*)dst); |
| |
| unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); |
| |
| over_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, &xmmDstLo, &xmmDstHi); |
| |
| /* rebuild the 4 pixels and save */ |
| save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| w -= 4; |
| dst += 4; |
| } |
| |
| while (w) |
| { |
| d = *dst; |
| *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| unpack_32_1x64 (d))); |
| w--; |
| } |
| |
| } |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSolid_nx0565 |
| */ |
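| /* Solid OVER onto r5g6b5: each 16-bit pixel is expanded to 8888, |
| * composited, and repacked. One aligned 128-bit load covers 8 destination |
| * pixels, which unpack565_128_4x128 spreads across four registers. */ |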
| void |
| fbCompositeSolid_nx0565sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t src; |
| uint16_t *dstLine, *dst, d; |
| uint16_t w; |
| int dstStride; |
| __m128i xmmSrc, xmmAlpha; |
| __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; |
| |
| fbComposeGetSolid(pSrc, src, pDst->bits.format); |
| |
| if (src >> 24 == 0) |
| return; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); |
| |
| xmmSrc = expandPixel_32_1x128 (src); |
| xmmAlpha = expandAlpha_1x128 (xmmSrc); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)dst); |
| |
| dstLine += dstStride; |
| w = width; |
| |
| while (w && (unsigned long)dst & 15) |
| { |
| d = *dst; |
| *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| expand565_16_1x64 (d)))); |
| w--; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 8) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)dst); |
| |
| xmmDst = load128Aligned ((__m128i*)dst); |
| |
| unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); |
| |
| over_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, &xmmDst0, &xmmDst1); |
| over_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, &xmmDst2, &xmmDst3); |
| |
| xmmDst = pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3); |
| |
| save128Aligned ((__m128i*)dst, xmmDst); |
| |
| dst += 8; |
| w -= 8; |
| } |
| |
| while (w--) |
| { |
| d = *dst; |
| *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| expand565_16_1x64 (d)))); |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSolidMask_nx8888x8888C |
| */ |
| |
| void |
| fbCompositeSolidMask_nx8888x8888Csse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t src, srca; |
| uint32_t *dstLine, d; |
| uint32_t *maskLine, m; |
| uint32_t packCmp; |
| int dstStride, maskStride; |
| |
| __m128i xmmSrc, xmmAlpha; |
| __m128i xmmDst, xmmDstLo, xmmDstHi; |
| __m128i xmmMask, xmmMaskLo, xmmMaskHi; |
| |
| fbComposeGetSolid(pSrc, src, pDst->bits.format); |
| |
| srca = src >> 24; |
| if (srca == 0) |
| return; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); |
| |
| xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ()); |
| xmmAlpha = expandAlpha_1x128 (xmmSrc); |
| |
| while (height--) |
| { |
| int w = width; |
| uint32_t *pm = (uint32_t *)maskLine; |
| uint32_t *pd = (uint32_t *)dstLine; |
| |
| dstLine += dstStride; |
| maskLine += maskStride; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w && (unsigned long)pd & 15) |
| { |
| m = *pm++; |
| |
| if (m) |
| { |
| d = *pd; |
| |
| *pd = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| unpack_32_1x64 (m), |
| unpack_32_1x64 (d))); |
| } |
| |
| pd++; |
| w--; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)pd); |
| cachePrefetch ((__m128i*)pm); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)pd); |
| cachePrefetchNext ((__m128i*)pm); |
| |
| xmmMask = load128Unaligned ((__m128i*)pm); |
| |
| packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128())); |
| |
| /* packCmp is 0xffff iff all four mask pixels are zero; in that case |
| * the mask contributes nothing and the destination is left untouched. */ |
| if (packCmp != 0xffff) |
| { |
| xmmDst = load128Aligned ((__m128i*)pd); |
| |
| unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); |
| unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); |
| |
| inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| } |
| |
| pd += 4; |
| pm += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| m = *pm++; |
| |
| if (m) |
| { |
| d = *pd; |
| |
| *pd = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| unpack_32_1x64 (m), |
| unpack_32_1x64 (d))); |
| } |
| |
| pd++; |
| w--; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSrc_8888x8x8888 |
| */ |
| |
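| /* OVER with a constant a8 mask: inOver computes (src IN mask) OVER dst. |
| * The solid mask alpha is replicated once into xmmMask outside the loops. */ |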
| void |
| fbCompositeSrc_8888x8x8888sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t *dstLine, *dst; |
| uint32_t *srcLine, *src; |
| uint32_t mask; |
| uint16_t w; |
| int dstStride, srcStride; |
| |
| __m128i xmmMask; |
| __m128i xmmSrc, xmmSrcLo, xmmSrcHi; |
| __m128i xmmDst, xmmDstLo, xmmDstHi; |
| __m128i xmmAlphaLo, xmmAlphaHi; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); |
| fbComposeGetSolid (pMask, mask, pDst->bits.format); |
| |
| xmmMask = createMask_16_128 (mask >> 24); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| src = srcLine; |
| srcLine += srcStride; |
| w = width; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)dst); |
| cachePrefetch ((__m128i*)src); |
| |
| while (w && (unsigned long)dst & 15) |
| { |
| uint32_t s = *src++; |
| uint32_t d = *dst; |
| |
| __m64 ms = unpack_32_1x64 (s); |
| |
| *dst++ = pack_1x64_32 (inOver_1x64 (ms, |
| expandAlpha_1x64 (ms), |
| _mm_movepi64_pi64 (xmmMask), |
| unpack_32_1x64 (d))); |
| |
| w--; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)dst); |
| cachePrefetch ((__m128i*)src); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)dst); |
| cachePrefetchNext ((__m128i*)src); |
| |
| xmmSrc = load128Unaligned ((__m128i*)src); |
| xmmDst = load128Aligned ((__m128i*)dst); |
| |
| unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); |
| |
| inOver_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, xmmMask, xmmMask, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| dst += 4; |
| src += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| uint32_t s = *src++; |
| uint32_t d = *dst; |
| |
| __m64 ms = unpack_32_1x64 (s); |
| |
| *dst++ = pack_1x64_32 (inOver_1x64 (ms, |
| expandAlpha_1x64 (ms), |
| _mm_movepi64_pi64 (xmmMask), |
| unpack_32_1x64 (d))); |
| |
| w--; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSrc_x888xnx8888 |
| */ |
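| /* x888 source: the undefined alpha byte is forced to 0xff on read, so |
| * the per-pixel alpha expansion collapses to the constant Mask00ff. */ |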
| void |
| fbCompositeSrc_x888xnx8888sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t *dstLine, *dst; |
| uint32_t *srcLine, *src; |
| uint32_t mask; |
| int dstStride, srcStride; |
| uint16_t w; |
| |
| __m128i xmmMask, xmmAlpha; |
| __m128i xmmSrc, xmmSrcLo, xmmSrcHi; |
| __m128i xmmDst, xmmDstLo, xmmDstHi; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); |
| fbComposeGetSolid (pMask, mask, pDst->bits.format); |
| |
| xmmMask = createMask_16_128 (mask >> 24); |
| xmmAlpha = Mask00ff; |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| src = srcLine; |
| srcLine += srcStride; |
| w = width; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)dst); |
| cachePrefetch ((__m128i*)src); |
| |
| while (w && (unsigned long)dst & 15) |
| { |
| uint32_t s = (*src++) | 0xff000000; |
| uint32_t d = *dst; |
| |
| *dst++ = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s), |
| _mm_movepi64_pi64 (xmmAlpha), |
| _mm_movepi64_pi64 (xmmMask), |
| unpack_32_1x64 (d))); |
| |
| w--; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)dst); |
| cachePrefetch ((__m128i*)src); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)dst); |
| cachePrefetchNext ((__m128i*)src); |
| |
| xmmSrc = load128Unaligned ((__m128i*)src); |
| xmmDst = load128Aligned ((__m128i*)dst); |
| |
| unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); |
| |
| inOver_2x128 (xmmSrcLo, xmmSrcHi, xmmAlpha, xmmAlpha, xmmMask, xmmMask, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| dst += 4; |
| src += 4; |
| w -= 4; |
| |
| } |
| |
| while (w) |
| { |
| uint32_t s = (*src++) | 0xff000000; |
| uint32_t d = *dst; |
| |
| *dst++ = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s), |
| _mm_movepi64_pi64 (xmmAlpha), |
| _mm_movepi64_pi64 (xmmMask), |
| unpack_32_1x64 (d))); |
| |
| w--; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSrc_8888x8888 |
| */ |
| void |
| fbCompositeSrc_8888x8888sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| int dstStride, srcStride; |
| uint32_t *dstLine, *dst; |
| uint32_t *srcLine, *src; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); |
| |
| dst = dstLine; |
| src = srcLine; |
| |
| while (height--) |
| { |
| coreCombineOverUsse2 (dst, src, width); |
| |
| dst += dstStride; |
| src += srcStride; |
| } |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSrc_8888x0565 |
| */ |
| static inline uint16_t |
| fbCompositeSrc_8888x0565pixel (uint32_t src, uint16_t dst) |
| { |
| __m64 ms; |
| |
| ms = unpack_32_1x64 (src); |
| return pack565_32_16( pack_1x64_32 (over_1x64 (ms, |
| expandAlpha_1x64 (ms), |
| expand565_16_1x64 (dst)))); |
| } |
| |
| void |
| fbCompositeSrc_8888x0565sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint16_t *dstLine, *dst, d; |
| uint32_t *srcLine, *src, s; |
| int dstStride, srcStride; |
| uint16_t w; |
| |
| __m128i xmmAlphaLo, xmmAlphaHi; |
| __m128i xmmSrc, xmmSrcLo, xmmSrcHi; |
| __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); |
| |
| #if 0 |
| /* FIXME |
| * |
| * This is copied from the MMX version along with its FIXME; |
| * if it's a problem there, it's probably a problem here. |
| */ |
| assert (pSrc->pDrawable == pMask->pDrawable); |
| #endif |
| |
| while (height--) |
| { |
| dst = dstLine; |
| src = srcLine; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| |
| dstLine += dstStride; |
| srcLine += srcStride; |
| w = width; |
| |
| /* Align dst on a 16-byte boundary */ |
| while (w && |
| ((unsigned long)dst & 15)) |
| { |
| s = *src++; |
| d = *dst; |
| |
| *dst++ = fbCompositeSrc_8888x0565pixel (s, d); |
| w--; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| |
| /* This is an 8-pixel loop */ |
| while (w >= 8) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)src); |
| cachePrefetchNext ((__m128i*)dst); |
| |
| /* Load unaligned: the source address is not guaranteed to be 16-byte aligned. */ |
| xmmSrc = load128Unaligned ((__m128i*) src); |
| xmmDst = load128Aligned ((__m128i*) dst); |
| |
| /* Unpacking */ |
| unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); |
| unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); |
| |
| /* Load the next 4 source pixels early to overlap the memory read. */ |
| xmmSrc = load128Unaligned ((__m128i*) (src+4)); |
| |
| over_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, &xmmDst0, &xmmDst1); |
| |
| /* Unpacking */ |
| unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); |
| expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); |
| |
| over_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, &xmmDst2, &xmmDst3); |
| |
| save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3)); |
| |
| w -= 8; |
| dst += 8; |
| src += 8; |
| } |
| |
| while (w--) |
| { |
| s = *src++; |
| d = *dst; |
| |
| *dst++ = fbCompositeSrc_8888x0565pixel (s, d); |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSolidMask_nx8x8888 |
| */ |
| |
| void |
| fbCompositeSolidMask_nx8x8888sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t src, srca; |
| uint32_t *dstLine, *dst; |
| uint8_t *maskLine, *mask; |
| int dstStride, maskStride; |
| uint16_t w; |
| uint32_t m, d; |
| |
| __m128i xmmSrc, xmmAlpha, xmmDef; |
| __m128i xmmDst, xmmDstLo, xmmDstHi; |
| __m128i xmmMask, xmmMaskLo, xmmMaskHi; |
| |
| fbComposeGetSolid(pSrc, src, pDst->bits.format); |
| |
| srca = src >> 24; |
| if (srca == 0) |
| return; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); |
| |
| xmmDef = createMask_2x32_128 (src, src); |
| xmmSrc = expandPixel_32_1x128 (src); |
| xmmAlpha = expandAlpha_1x128 (xmmSrc); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| mask = maskLine; |
| maskLine += maskStride; |
| w = width; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w && (unsigned long)dst & 15) |
| { |
| uint8_t m = *mask++; |
| |
| if (m) |
| { |
| d = *dst; |
| |
| *dst = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| expandPixel_8_1x64 (m), |
| unpack_32_1x64 (d))); |
| } |
| |
| w--; |
| dst++; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)mask); |
| cachePrefetchNext ((__m128i*)dst); |
| |
| m = *((uint32_t*)mask); |
| |
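| /* Fast path: fully opaque source and mask mean OVER reduces to a copy |
| * of the solid source, so store the pre-packed value directly. */ |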
| if (srca == 0xff && m == 0xffffffff) |
| { |
| save128Aligned ((__m128i*)dst, xmmDef); |
| } |
| else if (m) |
| { |
| xmmDst = load128Aligned ((__m128i*) dst); |
| xmmMask = unpack_32_1x128 (m); |
| xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); |
| |
| /* Unpacking */ |
| unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); |
| unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| } |
| |
| w -= 4; |
| dst += 4; |
| mask += 4; |
| } |
| |
| while (w) |
| { |
| uint8_t m = *mask++; |
| |
| if (m) |
| { |
| d = *dst; |
| |
| *dst = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| expandPixel_8_1x64 (m), |
| unpack_32_1x64 (d))); |
| } |
| |
| w--; |
| dst++; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * pixmanFillsse2 |
| */ |
| |
| pixman_bool_t |
| pixmanFillsse2 (uint32_t *bits, |
| int stride, |
| int bpp, |
| int x, |
| int y, |
| int width, |
| int height, |
| uint32_t data) |
| { |
| uint32_t byte_width; |
| uint8_t *byte_line; |
| |
| __m128i xmmDef; |
| |
| if (bpp == 16 && (data >> 16 != (data & 0xffff))) |
| return FALSE; |
| |
| if (bpp != 16 && bpp != 32) |
| return FALSE; |
| |
| if (bpp == 16) |
| { |
| stride = stride * (int) sizeof (uint32_t) / 2; |
| byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); |
| byte_width = 2 * width; |
| stride *= 2; |
| } |
| else |
| { |
| stride = stride * (int) sizeof (uint32_t) / 4; |
| byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); |
| byte_width = 4 * width; |
| stride *= 4; |
| } |
| |
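| /* Fill strategy: replicate `data` across a 128-bit register, align the |
| * destination with 16- and 32-bit stores, then issue aligned 128-, 64-, |
| * 32- and 16-byte block stores before finishing the tail. */ |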
| cachePrefetch ((__m128i*)byte_line); |
| xmmDef = createMask_2x32_128 (data, data); |
| |
| while (height--) |
| { |
| int w; |
| uint8_t *d = byte_line; |
| byte_line += stride; |
| w = byte_width; |
| |
| cachePrefetchNext ((__m128i*)d); |
| |
| while (w >= 2 && ((unsigned long)d & 3)) |
| { |
| *(uint16_t *)d = data; |
| w -= 2; |
| d += 2; |
| } |
| |
| while (w >= 4 && ((unsigned long)d & 15)) |
| { |
| *(uint32_t *)d = data; |
| |
| w -= 4; |
| d += 4; |
| } |
| |
| cachePrefetchNext ((__m128i*)d); |
| |
| while (w >= 128) |
| { |
| cachePrefetch (((__m128i*)d) + 12); |
| |
| save128Aligned ((__m128i*)(d), xmmDef); |
| save128Aligned ((__m128i*)(d+16), xmmDef); |
| save128Aligned ((__m128i*)(d+32), xmmDef); |
| save128Aligned ((__m128i*)(d+48), xmmDef); |
| save128Aligned ((__m128i*)(d+64), xmmDef); |
| save128Aligned ((__m128i*)(d+80), xmmDef); |
| save128Aligned ((__m128i*)(d+96), xmmDef); |
| save128Aligned ((__m128i*)(d+112), xmmDef); |
| |
| d += 128; |
| w -= 128; |
| } |
| |
| if (w >= 64) |
| { |
| cachePrefetch (((__m128i*)d) + 8); |
| |
| save128Aligned ((__m128i*)(d), xmmDef); |
| save128Aligned ((__m128i*)(d+16), xmmDef); |
| save128Aligned ((__m128i*)(d+32), xmmDef); |
| save128Aligned ((__m128i*)(d+48), xmmDef); |
| |
| d += 64; |
| w -= 64; |
| } |
| |
| cachePrefetchNext ((__m128i*)d); |
| |
| if (w >= 32) |
| { |
| save128Aligned ((__m128i*)(d), xmmDef); |
| save128Aligned ((__m128i*)(d+16), xmmDef); |
| |
| d += 32; |
| w -= 32; |
| } |
| |
| if (w >= 16) |
| { |
| save128Aligned ((__m128i*)(d), xmmDef); |
| |
| d += 16; |
| w -= 16; |
| } |
| |
| cachePrefetchNext ((__m128i*)d); |
| |
| while (w >= 4) |
| { |
| *(uint32_t *)d = data; |
| |
| w -= 4; |
| d += 4; |
| } |
| |
| if (w >= 2) |
| { |
| *(uint16_t *)d = data; |
| w -= 2; |
| d += 2; |
| } |
| } |
| |
| _mm_empty(); |
| return TRUE; |
| } |
| |
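| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSolidMaskSrc_nx8x8888 |
| */ |
| |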
| void |
| fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t src, srca; |
| uint32_t *dstLine, *dst; |
| uint8_t *maskLine, *mask; |
| int dstStride, maskStride; |
| uint16_t w; |
| uint32_t m; |
| |
| __m128i xmmSrc, xmmDef; |
| __m128i xmmMask, xmmMaskLo, xmmMaskHi; |
| |
| fbComposeGetSolid(pSrc, src, pDst->bits.format); |
| |
| srca = src >> 24; |
| if (srca == 0) |
| { |
| pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride, |
| PIXMAN_FORMAT_BPP (pDst->bits.format), |
| xDst, yDst, width, height, 0); |
| return; |
| } |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); |
| |
| xmmDef = createMask_2x32_128 (src, src); |
| xmmSrc = expandPixel_32_1x128 (src); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| mask = maskLine; |
| maskLine += maskStride; |
| w = width; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w && (unsigned long)dst & 15) |
| { |
| uint8_t m = *mask++; |
| |
| if (m) |
| { |
| *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m))); |
| } |
| else |
| { |
| *dst = 0; |
| } |
| |
| w--; |
| dst++; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)mask); |
| cachePrefetchNext ((__m128i*)dst); |
| |
| m = *((uint32_t*)mask); |
| |
| if (srca == 0xff && m == 0xffffffff) |
| { |
| save128Aligned ((__m128i*)dst, xmmDef); |
| } |
| else if (m) |
| { |
| xmmMask = unpack_32_1x128 (m); |
| xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); |
| |
| /* Unpacking */ |
| unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| pixMultiply_2x128 (xmmSrc, xmmSrc, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi)); |
| } |
| else |
| { |
| save128Aligned ((__m128i*)dst, _mm_setzero_si128()); |
| } |
| |
| w -= 4; |
| dst += 4; |
| mask += 4; |
| } |
| |
| while (w) |
| { |
| uint8_t m = *mask++; |
| |
| if (m) |
| { |
| *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m))); |
| } |
| else |
| { |
| *dst = 0; |
| } |
| |
| w--; |
| dst++; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSolidMask_nx8x0565 |
| */ |
| |
| void |
| fbCompositeSolidMask_nx8x0565sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t src, srca; |
| uint16_t *dstLine, *dst, d; |
| uint8_t *maskLine, *mask; |
| int dstStride, maskStride; |
| uint16_t w; |
| uint32_t m; |
| |
| __m128i xmmSrc, xmmAlpha; |
| __m128i xmmMask, xmmMaskLo, xmmMaskHi; |
| __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; |
| |
| fbComposeGetSolid(pSrc, src, pDst->bits.format); |
| |
| srca = src >> 24; |
| if (srca == 0) |
| return; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); |
| |
| xmmSrc = expandPixel_32_1x128 (src); |
| xmmAlpha = expandAlpha_1x128 (xmmSrc); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| mask = maskLine; |
| maskLine += maskStride; |
| w = width; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w && (unsigned long)dst & 15) |
| { |
| m = *mask++; |
| |
| if (m) |
| { |
| d = *dst; |
| |
| *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| expandAlphaRev_1x64 (unpack_32_1x64 (m)), |
| expand565_16_1x64 (d)))); |
| } |
| |
| w--; |
| dst++; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 8) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)mask); |
| cachePrefetchNext ((__m128i*)dst); |
| |
| xmmDst = load128Aligned ((__m128i*) dst); |
| unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); |
| |
| m = *((uint32_t*)mask); |
| mask += 4; |
| |
| if (m) |
| { |
| xmmMask = unpack_32_1x128 (m); |
| xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); |
| |
| /* Unpacking */ |
| unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst0, &xmmDst1); |
| } |
| |
| m = *((uint32_t*)mask); |
| mask += 4; |
| |
| if (m) |
| { |
| xmmMask = unpack_32_1x128 (m); |
| xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); |
| |
| /* Unpacking */ |
| unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); |
| |
| expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst2, &xmmDst3); |
| } |
| |
| save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3)); |
| |
| w -= 8; |
| dst += 8; |
| } |
| |
| while (w) |
| { |
| m = *mask++; |
| |
| if (m) |
| { |
| d = *dst; |
| |
| *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| expandAlphaRev_1x64 (unpack_32_1x64 (m)), |
| expand565_16_1x64 (d)))); |
| } |
| |
| w--; |
| dst++; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSrc_8888RevNPx0565 |
| */ |
| |
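| /* The source is ABGR, non-premultiplied (see the note after this |
| * function): overRevNonPre swaps the channel order and premultiplies on |
| * the fly, while packAlpha packs the four source alpha bytes so fully |
| * opaque blocks can take the cheaper invertColors path. */ |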
| void |
| fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint16_t *dstLine, *dst, d; |
| uint32_t *srcLine, *src, s; |
| int dstStride, srcStride; |
| uint16_t w; |
| uint32_t packCmp; |
| |
| __m64 ms; |
| __m128i xmmSrc, xmmSrcLo, xmmSrcHi; |
| __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); |
| |
| #if 0 |
| /* FIXME |
| * |
| * This is copied from the MMX version along with its FIXME; |
| * if it's a problem there, it's probably a problem here. |
| */ |
| assert (pSrc->pDrawable == pMask->pDrawable); |
| #endif |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| src = srcLine; |
| srcLine += srcStride; |
| w = width; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w && (unsigned long)dst & 15) |
| { |
| s = *src++; |
| d = *dst; |
| |
| ms = unpack_32_1x64 (s); |
| |
| *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d)))); |
| w--; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 8) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)src); |
| cachePrefetchNext ((__m128i*)dst); |
| |
| /* First round */ |
| xmmSrc = load128Unaligned((__m128i*)src); |
| xmmDst = load128Aligned ((__m128i*)dst); |
| |
| packCmp = packAlpha (xmmSrc); |
| |
| unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); |
| unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); |
| |
| /* preload next round*/ |
| xmmSrc = load128Unaligned((__m128i*)(src+4)); |
| |
| if (packCmp == 0xffffffff) |
| { |
| invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1); |
| } |
| else if (packCmp) |
| { |
| overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1); |
| } |
| |
| /* Second round */ |
| packCmp = packAlpha (xmmSrc); |
| |
| unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); |
| |
| if (packCmp == 0xffffffff) |
| { |
| invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3); |
| } |
| else if (packCmp) |
| { |
| overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3); |
| } |
| |
| save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3)); |
| |
| w -= 8; |
| src += 8; |
| dst += 8; |
| } |
| |
| while (w) |
| { |
| s = *src++; |
| d = *dst; |
| |
| ms = unpack_32_1x64 (s); |
| |
| *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d)))); |
| w--; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSrc_8888RevNPx8888 |
| */ |
| |
| void |
| fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t *dstLine, *dst, d; |
| uint32_t *srcLine, *src, s; |
| int dstStride, srcStride; |
| uint16_t w; |
| uint32_t packCmp; |
| |
| __m128i xmmSrcLo, xmmSrcHi; |
| __m128i xmmDstLo, xmmDstHi; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); |
| |
| #if 0 |
| /* FIXME |
| * |
| * This is copied from the MMX version along with its FIXME; |
| * if it's a problem there, it's probably a problem here. |
| */ |
| assert (pSrc->pDrawable == pMask->pDrawable); |
| #endif |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| src = srcLine; |
| srcLine += srcStride; |
| w = width; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w && (unsigned long)dst & 15) |
| { |
| s = *src++; |
| d = *dst; |
| |
| *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); |
| |
| w--; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 4) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)src); |
| cachePrefetchNext ((__m128i*)dst); |
| |
| xmmSrcHi = load128Unaligned((__m128i*)src); |
| |
| packCmp = packAlpha (xmmSrcHi); |
| |
| unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); |
| |
| if (packCmp == 0xffffffff) |
| { |
| invertColors_2x128( xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| } |
| else if (packCmp) |
| { |
| xmmDstHi = load128Aligned ((__m128i*)dst); |
| |
| unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| } |
| |
| w -= 4; |
| dst += 4; |
| src += 4; |
| } |
| |
| while (w) |
| { |
| s = *src++; |
| d = *dst; |
| |
| *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); |
| |
| w--; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSolidMask_nx8888x0565C |
| */ |
| |
| void |
| fbCompositeSolidMask_nx8888x0565Csse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t src, srca; |
| uint16_t *dstLine, *dst, d; |
| uint32_t *maskLine, *mask, m; |
| int dstStride, maskStride; |
| int w; |
| uint32_t packCmp; |
| |
| __m128i xmmSrc, xmmAlpha; |
| __m128i xmmMask, xmmMaskLo, xmmMaskHi; |
| __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; |
| |
| fbComposeGetSolid(pSrc, src, pDst->bits.format); |
| |
| srca = src >> 24; |
| if (srca == 0) |
| return; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); |
| |
| xmmSrc = unpack_32_1x128 (src); |
| xmmAlpha = expandAlpha_1x128 (xmmSrc); |
| |
| while (height--) |
| { |
| w = width; |
| mask = maskLine; |
| dst = dstLine; |
| maskLine += maskStride; |
| dstLine += dstStride; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w && ((unsigned long)dst & 15)) |
| { |
| m = *(uint32_t *) mask; |
| |
| if (m) |
| { |
| d = *dst; |
| |
| *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| unpack_32_1x64 (m), |
| expand565_16_1x64 (d)))); |
| } |
| |
| w--; |
| dst++; |
| mask++; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 8) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)mask); |
| cachePrefetchNext ((__m128i*)dst); |
| |
| /* First round */ |
| xmmMask = load128Unaligned((__m128i*)mask); |
| xmmDst = load128Aligned((__m128i*)dst); |
| |
| packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128())); |
| |
| unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); |
| unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); |
| |
| /* preload next round*/ |
| xmmMask = load128Unaligned((__m128i*)(mask+4)); |
| |
| if (packCmp != 0xffff) |
| { |
| inOver_2x128(xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst0, &xmmDst1); |
| } |
| |
| /* Second round */ |
| packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128())); |
| |
| unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); |
| |
| if (packCmp != 0xffff) |
| { |
| inOver_2x128(xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst2, &xmmDst3); |
| } |
| |
| save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3)); |
| |
| w -= 8; |
| dst += 8; |
| mask += 8; |
| } |
| |
| while (w) |
| { |
| m = *(uint32_t *) mask; |
| |
| if (m) |
| { |
| d = *dst; |
| |
| *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc), |
| _mm_movepi64_pi64 (xmmAlpha), |
| unpack_32_1x64 (m), |
| expand565_16_1x64 (d)))); |
| } |
| |
| w--; |
| dst++; |
| mask++; |
| } |
| } |
| |
| _mm_empty (); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeIn_nx8x8 |
| */ |
| |
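| /* IN with a solid source on a8: dst = srcAlpha * mask * dst. Pixels are |
| * single bytes, so the SIMD body covers 16 pixels per iteration. */ |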
| void |
| fbCompositeIn_nx8x8sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint8_t *dstLine, *dst; |
| uint8_t *maskLine, *mask; |
| int dstStride, maskStride; |
| uint16_t w, d, m; |
| uint32_t src; |
| uint8_t sa; |
| |
| __m128i xmmAlpha; |
| __m128i xmmMask, xmmMaskLo, xmmMaskHi; |
| __m128i xmmDst, xmmDstLo, xmmDstHi; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); |
| |
| fbComposeGetSolid(pSrc, src, pDst->bits.format); |
| |
| sa = src >> 24; |
| if (sa == 0) |
| return; |
| |
| xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src)); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| mask = maskLine; |
| maskLine += maskStride; |
| w = width; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w && ((unsigned long)dst & 15)) |
| { |
| m = (uint32_t) *mask++; |
| d = (uint32_t) *dst; |
| |
| *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), |
| unpack_32_1x64 (d))); |
| w--; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 16) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)mask); |
| cachePrefetchNext ((__m128i*)dst); |
| |
| xmmMask = load128Unaligned((__m128i*)mask); |
| xmmDst = load128Aligned((__m128i*)dst); |
| |
| unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); |
| unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); |
| |
| pixMultiply_2x128 (xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| mask += 16; |
| dst += 16; |
| w -= 16; |
| } |
| |
| while (w) |
| { |
| m = (uint32_t) *mask++; |
| d = (uint32_t) *dst; |
| |
| *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), |
| unpack_32_1x64 (d))); |
| w--; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeIn_8x8 |
| */ |
| |
| void |
| fbCompositeIn_8x8sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint8_t *dstLine, *dst; |
| uint8_t *srcLine, *src; |
| int srcStride, dstStride; |
| uint16_t w; |
| uint32_t s, d; |
| |
| __m128i xmmSrc, xmmSrcLo, xmmSrcHi; |
| __m128i xmmDst, xmmDstLo, xmmDstHi; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| src = srcLine; |
| srcLine += srcStride; |
| w = width; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w && ((unsigned long)dst & 15)) |
| { |
| s = (uint32_t) *src++; |
| d = (uint32_t) *dst; |
| |
| *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d))); |
| w--; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 16) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)src); |
| cachePrefetchNext ((__m128i*)dst); |
| |
| xmmSrc = load128Unaligned((__m128i*)src); |
| xmmDst = load128Aligned((__m128i*)dst); |
| |
| unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); |
| |
| pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| src += 16; |
| dst += 16; |
| w -= 16; |
| } |
| |
| while (w) |
| { |
| s = (uint32_t) *src++; |
| d = (uint32_t) *dst; |
| |
| *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d))); |
| w--; |
| } |
| } |
| |
| _mm_empty (); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSrcAdd_8888x8x8 |
| */ |
| |
| void |
| fbCompositeSrcAdd_8888x8x8sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint8_t *dstLine, *dst; |
| uint8_t *maskLine, *mask; |
| int dstStride, maskStride; |
| uint16_t w; |
| uint32_t src; |
| uint8_t sa; |
| uint32_t m, d; |
| |
| __m128i xmmAlpha; |
| __m128i xmmMask, xmmMaskLo, xmmMaskHi; |
| __m128i xmmDst, xmmDstLo, xmmDstHi; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); |
| |
| fbComposeGetSolid(pSrc, src, pDst->bits.format); |
| |
| sa = src >> 24; |
| if (sa == 0) |
| return; |
| |
| xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src)); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| mask = maskLine; |
| maskLine += maskStride; |
| w = width; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w && ((unsigned long)dst & 15)) |
| { |
| m = (uint32_t) *mask++; |
| d = (uint32_t) *dst; |
| |
| *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), |
| unpack_32_1x64 (d))); |
| w--; |
| } |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)mask); |
| cachePrefetch ((__m128i*)dst); |
| |
| while (w >= 16) |
| { |
| /* fill cache line with next memory */ |
| cachePrefetchNext ((__m128i*)mask); |
| cachePrefetchNext ((__m128i*)dst); |
| |
| xmmMask = load128Unaligned((__m128i*)mask); |
| xmmDst = load128Aligned((__m128i*)dst); |
| |
| unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); |
| unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); |
| |
| pixMultiply_2x128 (xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo); |
| xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi); |
| |
| save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| |
| mask += 16; |
| dst += 16; |
| w -= 16; |
| } |
| |
| while (w) |
| { |
| m = (uint32_t) *mask++; |
| d = (uint32_t) *dst; |
| |
| *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), |
| unpack_32_1x64 (d))); |
| w--; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSrcAdd_8000x8000 |
| */ |
| |
| void |
| fbCompositeSrcAdd_8000x8000sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint8_t *dstLine, *dst; |
| uint8_t *srcLine, *src; |
| int dstStride, srcStride; |
| uint16_t w; |
| uint16_t t; |
| |
| fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); |
| fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| src = srcLine; |
| |
| /* call prefetch hint to optimize cache load*/ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| |
| dstLine += dstStride; |
| srcLine += srcStride; |
| w = width; |
| |
| /* Small head */ |
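| /* t | (0 - (t >> 8)) is a branchless 8-bit saturating add: if the sum |
| * exceeds 255 then t >> 8 is 1, 0 - 1 is all ones, and the OR clamps |
| * the stored byte to 0xff. */ |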
| while (w && (unsigned long)dst & 3) |
| { |
| t = (*dst) + (*src++); |
| *dst++ = t | (0 - (t >> 8)); |
| w--; |
| } |
| |
| coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, w >> 2); |
| |
| /* Small tail */ |
| dst += w & 0xfffc; |
| src += w & 0xfffc; |
| |
| w &= 3; |
| |
| while (w) |
| { |
| t = (*dst) + (*src++); |
| *dst++ = t | (0 - (t >> 8)); |
| w--; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeSrcAdd_8888x8888 |
| */ |
| void |
| fbCompositeSrcAdd_8888x8888sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t *dstLine, *dst; |
| uint32_t *srcLine, *src; |
| int dstStride, srcStride; |
| |
| fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); |
| fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); |
| |
| while (height--) |
| { |
| dst = dstLine; |
| dstLine += dstStride; |
| src = srcLine; |
| srcLine += srcStride; |
| |
| coreCombineAddUsse2 (dst, src, width); |
| } |
| |
| _mm_empty(); |
| } |
| |
| /* ------------------------------------------------------------------------------------------------- |
| * fbCompositeCopyAreasse2 |
| */ |
| |
| pixman_bool_t |
| pixmanBltsse2 (uint32_t *src_bits, |
| uint32_t *dst_bits, |
| int src_stride, |
| int dst_stride, |
| int src_bpp, |
| int dst_bpp, |
| int src_x, int src_y, |
| int dst_x, int dst_y, |
| int width, int height) |
| { |
| uint8_t * src_bytes; |
| uint8_t * dst_bytes; |
| int byte_width; |
| |
| if (src_bpp != dst_bpp) |
| return FALSE; |
| |
| if (src_bpp == 16) |
| { |
| src_stride = src_stride * (int) sizeof (uint32_t) / 2; |
| dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; |
| src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); |
| dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); |
| byte_width = 2 * width; |
| src_stride *= 2; |
| dst_stride *= 2; |
| } |
| else if (src_bpp == 32) |
| { |
| src_stride = src_stride * (int) sizeof (uint32_t) / 4; |
| dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; |
| src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); |
| dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); |
| byte_width = 4 * width; |
| src_stride *= 4; |
| dst_stride *= 4; |
| } |
| else |
| { |
| return FALSE; |
| } |
| |
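| /* Per scanline: align the destination with 16- and 32-bit copies, then |
| * move 64 bytes per iteration using unaligned loads and aligned stores; |
| * the source is never assumed to be 16-byte aligned. */ |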
| cachePrefetch ((__m128i*)src_bytes); |
| cachePrefetch ((__m128i*)dst_bytes); |
| |
| while (height--) |
| { |
| int w; |
| uint8_t *s = src_bytes; |
| uint8_t *d = dst_bytes; |
| src_bytes += src_stride; |
| dst_bytes += dst_stride; |
| w = byte_width; |
| |
| cachePrefetchNext ((__m128i*)s); |
| cachePrefetchNext ((__m128i*)d); |
| |
| while (w >= 2 && ((unsigned long)d & 3)) |
| { |
| *(uint16_t *)d = *(uint16_t *)s; |
| w -= 2; |
| s += 2; |
| d += 2; |
| } |
| |
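| /* Then align to a 16-byte boundary with 32-bit copies, so the wide |
| * stores below can use aligned access (source reads stay unaligned). */ |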
| while (w >= 4 && ((unsigned long)d & 15)) |
| { |
| *(uint32_t *)d = *(uint32_t *)s; |
| |
| w -= 4; |
| s += 4; |
| d += 4; |
| } |
| |
| cachePrefetchNext ((__m128i*)s); |
| cachePrefetchNext ((__m128i*)d); |
| |
| while (w >= 64) |
| { |
| __m128i xmm0, xmm1, xmm2, xmm3; |
| |
| /* Prefetch 128 bytes ahead of the current position. */ |
| cachePrefetch (((__m128i*)s) + 8); |
| cachePrefetch (((__m128i*)d) + 8); |
| |
| xmm0 = load128Unaligned ((__m128i*)(s)); |
| xmm1 = load128Unaligned ((__m128i*)(s+16)); |
| xmm2 = load128Unaligned ((__m128i*)(s+32)); |
| xmm3 = load128Unaligned ((__m128i*)(s+48)); |
| |
| save128Aligned ((__m128i*)(d), xmm0); |
| save128Aligned ((__m128i*)(d+16), xmm1); |
| save128Aligned ((__m128i*)(d+32), xmm2); |
| save128Aligned ((__m128i*)(d+48), xmm3); |
| |
| s += 64; |
| d += 64; |
| w -= 64; |
| } |
| |
| cachePrefetchNext ((__m128i*)s); |
| cachePrefetchNext ((__m128i*)d); |
| |
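| /* Copy any remaining aligned span one 16-byte block at a time. */ |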
| while (w >= 16) |
| { |
| save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s)); |
| |
| w -= 16; |
| d += 16; |
| s += 16; |
| } |
| |
| cachePrefetchNext ((__m128i*)s); |
| cachePrefetchNext ((__m128i*)d); |
| |
| while (w >= 4) |
| { |
| *(uint32_t *)d = *(uint32_t *)s; |
| |
| w -= 4; |
| s += 4; |
| d += 4; |
| } |
| |
| if (w >= 2) |
| { |
| *(uint16_t *)d = *(uint16_t *)s; |
| w -= 2; |
| s += 2; |
| d += 2; |
| } |
| } |
| |
| _mm_empty(); |
| |
| return TRUE; |
| } |
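| |
| /* |
| * Usage sketch (hypothetical values, not taken from the library): copy a |
| * 100x50 pixel block between two 32 bpp images whose rowstride is 256 |
| * uint32_t units, from (0, 0) in the source to (10, 20) in the target: |
| * |
| * pixman_bool_t ok = pixmanBltsse2 (src_bits, dst_bits, |
| * 256, 256, 32, 32, |
| * 0, 0, 10, 20, 100, 50); |
| * |
| * The call returns FALSE when the two depths differ or are neither 16 nor |
| * 32 bpp, so callers need a fallback path for those cases. |
| */ |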
| |
| void |
| fbCompositeCopyAreasse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
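| /* The return value is deliberately ignored; this entry point is |
| * presumably dispatched only when src and dst share a supported bpp, |
| * in which case the blit cannot fail. */ |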
| pixmanBltsse2 (pSrc->bits.bits, |
| pDst->bits.bits, |
| pSrc->bits.rowstride, |
| pDst->bits.rowstride, |
| PIXMAN_FORMAT_BPP (pSrc->bits.format), |
| PIXMAN_FORMAT_BPP (pDst->bits.format), |
| xSrc, ySrc, xDst, yDst, width, height); |
| } |
| |
| #if 0 |
| /* This code was buggy in the MMX version, and the bug carried over into this SSE2 port; it stays disabled until it is fixed. */ |
| void |
| fbCompositeOver_x888x8x8888sse2 (pixman_op_t op, |
| pixman_image_t * pSrc, |
| pixman_image_t * pMask, |
| pixman_image_t * pDst, |
| int16_t xSrc, |
| int16_t ySrc, |
| int16_t xMask, |
| int16_t yMask, |
| int16_t xDst, |
| int16_t yDst, |
| uint16_t width, |
| uint16_t height) |
| { |
| uint32_t *src, *srcLine, s; |
| uint32_t *dst, *dstLine, d; |
| uint8_t *mask, *maskLine; |
| uint32_t m; |
| int srcStride, maskStride, dstStride; |
| uint16_t w; |
| |
| __m128i xmmSrc, xmmSrcLo, xmmSrcHi; |
| __m128i xmmDst, xmmDstLo, xmmDstHi; |
| __m128i xmmMask, xmmMaskLo, xmmMaskHi; |
| |
| fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); |
| fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); |
| fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); |
| |
| while (height--) |
| { |
| src = srcLine; |
| srcLine += srcStride; |
| dst = dstLine; |
| dstLine += dstStride; |
| mask = maskLine; |
| maskLine += maskStride; |
| |
| w = width; |
| |
| /* Prefetch the first source, destination, and mask cache lines. */ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| cachePrefetch ((__m128i*)mask); |
| |
| while (w && (unsigned long)dst & 15) |
| { |
| __m64 ms; |
| |
| s = 0xff000000 | *src++; |
| m = (uint32_t) *mask++; |
| d = *dst; |
| |
| ms = unpack_32_1x64 (s); |
| |
| if (m != 0xff) |
| { |
| ms = inOver_1x64 (ms, |
| xMask00ff, |
| expandAlphaRev_1x64 (unpack_32_1x64 (m)), |
| unpack_32_1x64 (d)); |
| } |
| |
| *dst++ = pack_1x64_32 (ms); |
| w--; |
| } |
| |
| /* Refresh the prefetch hints before the 4-pixel vector loop. */ |
| cachePrefetch ((__m128i*)src); |
| cachePrefetch ((__m128i*)dst); |
| cachePrefetch ((__m128i*)mask); |
| |
| while (w >= 4) |
| { |
| /* Prefetch the next cache lines for src, dst, and mask. */ |
| cachePrefetchNext ((__m128i*)src); |
| cachePrefetchNext ((__m128i*)dst); |
| cachePrefetchNext ((__m128i*)mask); |
| |
| m = *(uint32_t*) mask; |
| xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000); |
| |
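| /* All four mask bytes opaque: store the source (with alpha |
| * forced to 0xff) directly. */ |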
| if (m == 0xffffffff) |
| { |
| save128Aligned ((__m128i*)dst, xmmSrc); |
| } |
| else |
| { |
| xmmDst = load128Aligned ((__m128i*)dst); |
| |
| xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); |
| |
| unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); |
| unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); |
| unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); |
| |
| expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); |
| |
| inOver_2x128 (xmmSrcLo, xmmSrcHi, Mask00ff, Mask00ff, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); |
| |
| save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); |
| } |
| |
| src += 4; |
| dst += 4; |
| mask += 4; |
| w -= 4; |
| } |
| |
| while (w) |
| { |
| m = (uint32_t) *mask++; |
| |
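| /* A zero mask contributes nothing; leave the destination pixel |
| * untouched. */ |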
| if (m) |
| { |
| s = 0xff000000 | *src; |
| |
| if (m == 0xff) |
| { |
| *dst = s; |
| } |
| else |
| { |
| d = *dst; |
| |
| *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s), |
| xMask00ff, |
| expandAlphaRev_1x64 (unpack_32_1x64 (m)), |
| unpack_32_1x64 (d))); |
| } |
| |
| } |
| |
| src++; |
| dst++; |
| w--; |
| } |
| } |
| |
| _mm_empty(); |
| } |
| #endif /* #if 0 */ |
| |
| #endif /* USE_SSE2 */ |