mmx: add an MMX fixed-point implementation of qcms_transform_data_rgb_out_lut and register it as the RGB8 transform
diff --git a/qcmsint.h b/qcmsint.h
index c5d12a2..f9f5d06 100644
--- a/qcmsint.h
+++ b/qcmsint.h
@@ -281,6 +281,11 @@
unsigned char *src,
unsigned char *dest,
size_t length);
+/* MMX fixed-point RGB (no alpha) LUT transform; defined in transform-sse1.c.
+ * src holds 3 bytes per pixel; length is the pixel count. */
+void qcms_transform_data_rgb_out_lut_mmx(qcms_transform *transform,
+ unsigned char *src,
+ unsigned char *dest,
+ size_t length);
+
void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
unsigned char *src,
diff --git a/transform-sse1.c b/transform-sse1.c
index ecca4cc..acf16da 100644
--- a/transform-sse1.c
+++ b/transform-sse1.c
@@ -273,3 +273,150 @@
_mm_empty();
}
+
+
+/* Fixed-point scale used by the MMX path: gamma and matrix values carry
+ * ONE_SHIFT (= 14) fractional bits. */
+#define ONE_SHIFT 14
+/* NOTE(review): fixScaleX4 is not referenced by the MMX routine below —
+ * confirm whether it is still needed or can be dropped from this patch. */
+static const ALIGN float fixScaleX4[4] =
+ { 1<<ONE_SHIFT, 1<<ONE_SHIFT, 1<<ONE_SHIFT, 1<<ONE_SHIFT};
+
+/* Fixed-point (MMX) variant of qcms_transform_data_rgb_out_lut.
+ *
+ * For each of `length` RGB8 pixels in src (3 bytes per pixel): look the
+ * R/G/B bytes up in the per-channel input gamma tables, multiply the
+ * resulting vector by the 3x4 color matrix using _mm_madd_pi16, round,
+ * shift back down, clamp, and use the three clamped values as indices
+ * into the precached output tables to produce the 3 output bytes.
+ * The loop is software-pipelined: each iteration stores the previous
+ * pixel's indices, then issues the next pixel's gamma loads before
+ * reading the stored indices back, so one pixel is handled after the
+ * loop. Calls _mm_empty() before returning, as required after MMX use.
+ *
+ * NOTE(review): caller must ensure MMX is available; the registration
+ * site currently gates this behind an SSE2 check — confirm intent. */
+void qcms_transform_data_rgb_out_lut_mmx(qcms_transform *transform,
+ unsigned char *src,
+ unsigned char *dest,
+ size_t length)
+{
+ unsigned int i;
+ /* NOTE(review): transform->matrix is reinterpreted as int[3][4] here —
+ * this assumes the matrix was precomputed in fixed point (ONE_SHIFT
+ * fractional bits) for this code path; confirm against the setup code
+ * in transform.c. */
+ int (*mat)[4] = transform->matrix;
+ char input_back[32];
+ /* Ensure we have a buffer that's 16 byte aligned regardless of the original
+ * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
+ * because they don't work on stack variables. gcc 4.4 does do the right thing
+ * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
+ float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
+ /* share input and output locations to save having to keep the
+ * locations in separate registers */
+ uint32_t const * output = (uint32_t*)input;
+ /* NOTE(review): `input` is never used as float here (the type is left
+ * over from the SSE1 float path), and both pointers are declared
+ * pointer-to-const yet written through (__m64 *) casts below. Legal,
+ * since input_back itself is non-const, but misleading — consider
+ * dropping the const and the float typing. */
+
+ /* deref *transform now to avoid it in loop */
+ const int *igtbl_r = transform->input_gamma_table_r;
+ const int *igtbl_g = transform->input_gamma_table_g;
+ const int *igtbl_b = transform->input_gamma_table_b;
+
+ /* deref *transform now to avoid it in loop */
+ const uint8_t *otdata_r = &transform->output_table_r->data[0];
+ const uint8_t *otdata_g = &transform->output_table_g->data[0];
+ const uint8_t *otdata_b = &transform->output_table_b->data[0];
+
+ /* these values don't change, either */
+ /* NOTE(review): the clamp below uses 16-bit-lane min/max
+ * (_mm_max_pi16/_mm_min_pi16) on 32-bit lanes with max = 4096 — this is
+ * only correct if every post-shift intermediate fits in the low 16 bits
+ * of its lane, and it implies the output tables hold at least 4097
+ * entries; confirm both against the table sizes used in transform.c. */
+ const __m64 max = _mm_set1_pi16(4096);
+ const __m64 min = _mm_setzero_si64();
+ /* rounding bias: half of the total right-shift applied after madd,
+ * i.e. 1 << (16 - 1), since ONE_SHIFT+(ONE_SHIFT-12) == 16 */
+ const __m64 half = _mm_set1_pi32(1<<(ONE_SHIFT+(ONE_SHIFT-12)-1));
+
+ /* input matrix values never change */
+ /* Pack matrix rows 0 and 1 into interleaved 16-bit lanes so one
+ * _mm_madd_pi16 computes r*m0+g*m1 pairs. NOTE(review):
+ * _mm_packs_pi32 saturates 32->16, so matrix entries must fit in
+ * int16 — confirm the fixed-point precompute guarantees this. */
+ const __m64 mat0_lo = *(__m64*)&(mat[0][0]);
+ const __m64 mat0_hi = *(__m64*)&(mat[0][2]);
+ const __m64 mat1_lo = *(__m64*)&(mat[1][0]);
+ const __m64 mat1_hi = *(__m64*)&(mat[1][2]);
+ const __m64 mat01_lo = _mm_unpacklo_pi16(_mm_packs_pi32(mat0_lo, mat0_lo), _mm_packs_pi32(mat1_lo, mat1_lo));
+ const __m64 mat01_hi = _mm_unpacklo_pi16(_mm_packs_pi32(mat0_hi, mat0_hi), _mm_packs_pi32(mat1_hi, mat1_hi));
+ const __m64 mat2_lo = *(__m64*)&(mat[2][0]);
+ const __m64 mat2_hi = *(__m64*)&(mat[2][2]);
+
+ /* working variables */
+ __m64 vec_r_lo, vec_g_lo, vec_b_lo, result_lo;
+ __m64 vec_r_hi, vec_g_hi, vec_b_hi, result_hi;
+
+ /* CYA */
+ if (!length)
+ return;
+
+ /* one pixel is handled outside of the loop */
+ length--;
+
+ /* setup for transforming 1st pixel */
+ vec_r_lo = _mm_set1_pi16(igtbl_r[src[0]]);
+ vec_r_hi = _mm_set1_pi16(igtbl_r[src[0]]);
+ vec_g_lo = _mm_set1_pi16(igtbl_g[src[1]]);
+ vec_g_hi = _mm_set1_pi16(igtbl_g[src[1]]);
+ vec_b_lo = _mm_set1_pi32(igtbl_b[src[2]]);
+ vec_b_hi = _mm_set1_pi32(igtbl_b[src[2]]);
+ src += 3;
+
+ /* transform all but final pixel */
+
+ for (i=0; i<length; i++)
+ {
+ /* position values from gamma tables */
+ /* NOTE(review): declarations after statements below require C99 or
+ * a GNU extension — confirm the project's minimum compiler allows
+ * this, or hoist them to the top of the block. */
+ __m64 vec_rg_lo, vec_rg_hi;
+ vec_rg_lo = vec_rg_hi = _mm_unpacklo_pi16(vec_r_lo, vec_g_lo);
+ /* gamma * matrix */
+ vec_rg_lo = _mm_madd_pi16(vec_rg_lo, mat01_lo);
+ vec_rg_hi = _mm_madd_pi16(vec_rg_hi, mat01_hi);
+ vec_b_lo = _mm_madd_pi16(vec_b_lo, mat2_lo);
+ vec_b_hi = _mm_madd_pi16(vec_b_hi, mat2_hi);
+
+ /* sum the rg and b partial products, add the rounding bias */
+ __m64 vec_r1 = _mm_add_pi32(vec_b_lo, vec_rg_lo);
+ vec_r1 = _mm_add_pi32(vec_r1, half);
+
+ __m64 vec_r2 = _mm_add_pi32(vec_b_hi, vec_rg_hi);
+ vec_r2 = _mm_add_pi32(vec_r2, half);
+
+ /* drop the fixed-point fraction, then clamp to [0, 4096] */
+ vec_r1 = _mm_srai_pi32(vec_r1, ONE_SHIFT+(ONE_SHIFT-12));
+ vec_r2 = _mm_srai_pi32(vec_r2, ONE_SHIFT+(ONE_SHIFT-12));
+ vec_r1 = _mm_max_pi16(min, vec_r1);
+ vec_r2 = _mm_max_pi16(min, vec_r2);
+ result_lo = _mm_min_pi16(max, vec_r1);
+ result_hi = _mm_min_pi16(max, vec_r2);
+
+ /* store calc'd output tables indices */
+ /* output[0]=R index, output[1]=G index, output[2]=B index
+ * (output[3] is don't-care); writes through the const cast into the
+ * aligned stack buffer */
+ *((__m64*)output) = result_lo;
+ *(__m64*)(output+2) = result_hi;
+
+ /* load for next loop while store completes */
+ vec_r_hi = vec_r_lo = _mm_set1_pi16(igtbl_r[src[0]]);
+ vec_g_hi = vec_g_lo = _mm_set1_pi16(igtbl_g[src[1]]);
+ vec_b_hi = vec_b_lo = _mm_set1_pi32(igtbl_b[src[2]]);
+ src += 3;
+
+ /* use calc'd indices to output RGB values */
+ dest[0] = otdata_r[output[0]];
+ dest[1] = otdata_g[output[1]];
+ dest[2] = otdata_b[output[2]];
+ dest += 3;
+ }
+
+ /* handle final (maybe only) pixel */
+
+ __m64 vec_rg_lo = _mm_unpacklo_pi16(vec_r_lo, vec_g_lo);
+ __m64 vec_rg_hi = _mm_unpacklo_pi16(vec_r_hi, vec_g_hi);
+
+ /* gamma * matrix */
+ vec_rg_lo = _mm_madd_pi16(vec_rg_lo, mat01_lo);
+ vec_rg_hi = _mm_madd_pi16(vec_rg_hi, mat01_hi);
+ vec_b_lo = _mm_madd_pi16(vec_b_lo, mat2_lo);
+ vec_b_hi = _mm_madd_pi16(vec_b_hi, mat2_hi);
+
+ __m64 vec_r1 = _mm_add_pi32(vec_b_lo, vec_rg_lo);
+ vec_r1 = _mm_add_pi32(vec_r1, half);
+
+ __m64 vec_r2 = _mm_add_pi32(vec_b_hi, vec_rg_hi);
+ vec_r2 = _mm_add_pi32(vec_r2, half);
+
+ vec_r1 = _mm_srai_pi32(vec_r1, ONE_SHIFT+(ONE_SHIFT-12));
+ vec_r2 = _mm_srai_pi32(vec_r2, ONE_SHIFT+(ONE_SHIFT-12));
+ vec_r1 = _mm_max_pi16(min, vec_r1);
+ vec_r2 = _mm_max_pi16(min, vec_r2);
+ result_lo = _mm_min_pi16(max, vec_r1);
+ result_hi = _mm_min_pi16(max, vec_r2);
+
+ /* store calc'd output tables indices */
+ *(__m64*)output = result_lo;
+ *(__m64*)(output+2) = result_hi;
+
+ /* leave MMX state before any FP use; the plain-byte table lookups
+ * below are safe after this */
+ _mm_empty();
+
+ dest[0] = otdata_r[output[0]];
+ dest[1] = otdata_g[output[1]];
+ dest[2] = otdata_b[output[2]];
+}
+
+
diff --git a/transform.c b/transform.c
index 025271d..f55e074 100644
--- a/transform.c
+++ b/transform.c
@@ -988,7 +988,6 @@
static int sse_version_available(void)
{
- return 0;
#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
/* we know at build time that 64-bit CPUs always have SSE2
* this tells the compiler that non-SSE2 branches will never be
@@ -1214,7 +1213,7 @@
#ifdef X86
if (sse_version_available() >= 2) {
if (in_type == QCMS_DATA_RGB_8)
- transform->transform_fn = qcms_transform_data_rgb_out_lut_sse41_int;
+ transform->transform_fn = qcms_transform_data_rgb_out_lut_mmx;
else
transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;