/* The gcc-provided Loongson intrinsic functions are too broken to be of
 * any use, otherwise I'd use them.
| * |
| * - The hardware instructions are very similar to MMX or iwMMXt. Certainly |
| * close enough that they could have implemented the _mm_*-style intrinsic |
| * interface and had a ton of optimized code available to them. Instead they |
| * implemented something much, much worse. |
| * |
| * - pshuf takes a dead first argument, causing extra instructions to be |
| * generated. |
| * |
| * - There are no 64-bit shift or logical intrinsics, which means you have |
| * to implement them with inline assembly, but this is a nightmare because |
| * gcc doesn't understand that the integer vector datatypes are actually in |
| * floating-point registers, so you end up with braindead code like |
| * |
| * punpcklwd $f9,$f9,$f5 |
| * dmtc1 v0,$f8 |
| * punpcklwd $f19,$f19,$f5 |
| * dmfc1 t9,$f9 |
| * dmtc1 v0,$f9 |
| * dmtc1 t9,$f20 |
| * dmfc1 s0,$f19 |
| * punpcklbh $f20,$f20,$f2 |
| * |
| * where crap just gets copied back and forth between integer and floating- |
 * point registers ad nauseam.
| * |
 * Instead of trying to work around the problems with these intrinsics, I
| * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline |
| * assembly. |
| */ |
| |
| #include <stdint.h> |
| |
/* vectors are stored in 64-bit floating-point registers; the double/float
 * types are only carriers for the raw 64-/32-bit patterns and are never
 * used for FP arithmetic */
typedef double __m64;
/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
typedef float __m32;
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_setzero_si64 (void) |
| { |
| return 0.0; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_adds_pu16 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("paddush %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_adds_pu8 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("paddusb %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_and_si64 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("and %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
    /* Intentionally a no-op.  NOTE(review): presumably Loongson needs no
     * state-reset instruction analogous to x86 EMMS — confirm against the
     * hardware manual. */
}
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_madd_pi16 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("pmaddhw %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_mulhi_pu16 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("pmulhuh %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_mullo_pi16 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("pmullh %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_or_si64 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("or %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_packs_pu16 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("packushb %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_shuffle_pi16 (__m64 __m, int64_t __n) |
| { |
| __m64 ret; |
| asm("pshufh %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m), "f" (*(__m64 *)&__n) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_slli_si64 (__m64 __m, int64_t __count) |
| { |
| __m64 ret; |
| asm("dsll %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m), "f" (*(__m64 *)&__count) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_srli_pi16 (__m64 __m, int64_t __count) |
| { |
| __m64 ret; |
| asm("psrlh %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m), "f" (*(__m64 *)&__count) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_srli_si64 (__m64 __m, int64_t __count) |
| { |
| __m64 ret; |
| asm("dsrl %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m), "f" (*(__m64 *)&__count) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("punpckhbh %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("punpckhhw %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("punpcklbh %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| /* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which |
| * allows load8888 to use 32-bit loads */ |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("punpcklbh %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("punpcklhw %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| _mm_xor_si64 (__m64 __m1, __m64 __m2) |
| { |
| __m64 ret; |
| asm("xor %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| loongson_extract_pi16 (__m64 __m, int64_t __pos) |
| { |
| __m64 ret; |
| asm("pextrh %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m), "f" (*(__m64 *)&__pos) |
| ); |
| return ret; |
| } |
| |
| extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
| loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos) |
| { |
| __m64 ret; |
| asm("pinsrh_%3 %0, %1, %2\n\t" |
| : "=f" (ret) |
| : "f" (__m1), "f" (__m2), "i" (__pos) |
| ); |
| return ret; |
| } |