mmx: add an MMX fixed-point implementation of the RGB output-LUT transform
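
This adds qcms_transform_data_rgb_out_lut_mmx() to transform-sse1.c, declares it in
qcmsint.h, removes the hard-coded "return 0;" that disabled SSE detection in
sse_version_available(), and routes the QCMS_DATA_RGB_8 precache path through the
new function.

The kernel works in fixed point: it reads transform->matrix and the input gamma
tables through int pointers, and the ONE_SHIFT constants, the final shift by 16 and
the clamp to [0, 4096] suggest 14-bit fixed point, so it relies on companion changes
that convert those tables from float. Per output component, the MMX intrinsics
roughly amount to the scalar sketch below (illustration only; lut_component, coef
and out_table are hypothetical names, not part of qcms):

    /* One output component: r, g, b are fixed-point values from the input
     * gamma tables, coef[] is one column of transform->matrix
     * (mat[0][c], mat[1][c], mat[2][c]), out_table is the precached output
     * gamma table for that component. */
    static unsigned char lut_component(int r, int g, int b,
                                       const int coef[3],
                                       const unsigned char *out_table)
    {
        int acc = r * coef[0] + g * coef[1] + b * coef[2];
        acc += 1 << 15;               /* rounding bias: 1 << (ONE_SHIFT + (ONE_SHIFT - 12) - 1) */
        acc >>= 16;                   /* drop the ONE_SHIFT + (ONE_SHIFT - 12) fractional bits */
        if (acc < 0)    acc = 0;      /* clamp to the output table range */
        if (acc > 4096) acc = 4096;
        return out_table[acc];
    }
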
diff --git a/qcmsint.h b/qcmsint.h
index c5d12a2..f9f5d06 100644
--- a/qcmsint.h
+++ b/qcmsint.h
@@ -281,6 +281,11 @@
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length);
+void qcms_transform_data_rgb_out_lut_mmx(qcms_transform *transform,
+                                          unsigned char *src,
+                                          unsigned char *dest,
+                                          size_t length);
+
 
 void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
                                           unsigned char *src,
diff --git a/transform-sse1.c b/transform-sse1.c
index ecca4cc..acf16da 100644
--- a/transform-sse1.c
+++ b/transform-sse1.c
@@ -273,3 +273,148 @@
 
     _mm_empty();
 }
+
+
+#define ONE_SHIFT 14
+static const ALIGN float fixScaleX4[4] =
+    { 1<<ONE_SHIFT, 1<<ONE_SHIFT, 1<<ONE_SHIFT, 1<<ONE_SHIFT};
+
+void qcms_transform_data_rgb_out_lut_mmx(qcms_transform *transform,
+                                          unsigned char *src,
+                                          unsigned char *dest,
+                                          size_t length)
+{
+    unsigned int i;
+    int (*mat)[4] = transform->matrix;
+    char input_back[32];
+    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
+     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
+     * because they don't work on stack variables. gcc 4.4 does do the right thing
+     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
+    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
+    /* share input and output locations to save having to keep the
+     * locations in separate registers */
+    uint32_t const * output = (uint32_t*)input;
+
+    /* deref *transform now to avoid it in loop */
+    const int *igtbl_r = transform->input_gamma_table_r;
+    const int *igtbl_g = transform->input_gamma_table_g;
+    const int *igtbl_b = transform->input_gamma_table_b;
+
+    /* deref *transform now to avoid it in loop */
+    const uint8_t *otdata_r = &transform->output_table_r->data[0];
+    const uint8_t *otdata_g = &transform->output_table_g->data[0];
+    const uint8_t *otdata_b = &transform->output_table_b->data[0];
+
+    /* these values don't change, either */
+    const __m64 max   = _mm_set1_pi16(4096);
+    const __m64 min   = _mm_setzero_si64();
+    const __m64 half  = _mm_set1_pi32(1<<(ONE_SHIFT+(ONE_SHIFT-12)-1));
+
+    /* input matrix values never change */
+    const __m64 mat0_lo  = *(__m64*)&(mat[0][0]);
+    const __m64 mat0_hi  = *(__m64*)&(mat[0][2]);
+    const __m64 mat1_lo  = *(__m64*)&(mat[1][0]);
+    const __m64 mat1_hi  = *(__m64*)&(mat[1][2]);
+    const __m64 mat01_lo = _mm_unpacklo_pi16(_mm_packs_pi32(mat0_lo, mat0_lo), _mm_packs_pi32(mat1_lo, mat1_lo));
+    const __m64 mat01_hi = _mm_unpacklo_pi16(_mm_packs_pi32(mat0_hi, mat0_hi), _mm_packs_pi32(mat1_hi, mat1_hi));
+    const __m64 mat2_lo  = *(__m64*)&(mat[2][0]);
+    const __m64 mat2_hi  = *(__m64*)&(mat[2][2]);
+
+    /* working variables */
+    __m64 vec_r_lo, vec_g_lo, vec_b_lo, result_lo;
+    __m64 vec_r_hi, vec_g_hi, vec_b_hi, result_hi;
+
+    /* nothing to do for an empty buffer */
+    if (!length)
+        return;
+
+    /* one pixel is handled outside of the loop */
+    length--;
+
+    /* setup for transforming 1st pixel */
+    vec_r_lo = _mm_set1_pi16(igtbl_r[src[0]]);
+    vec_r_hi = _mm_set1_pi16(igtbl_r[src[0]]);
+    vec_g_lo = _mm_set1_pi16(igtbl_g[src[1]]);
+    vec_g_hi = _mm_set1_pi16(igtbl_g[src[1]]);
+    vec_b_lo = _mm_set1_pi32(igtbl_b[src[2]]);
+    vec_b_hi = _mm_set1_pi32(igtbl_b[src[2]]);
+    src += 3;
+
+    /* transform all but final pixel */
+
+    for (i=0; i<length; i++)
+    {
+        /* position values from gamma tables */
+        __m64 vec_rg_lo, vec_rg_hi;
+        vec_rg_lo = vec_rg_hi = _mm_unpacklo_pi16(vec_r_lo, vec_g_lo);
+        /* gamma * matrix */
+        vec_rg_lo = _mm_madd_pi16(vec_rg_lo, mat01_lo);
+        vec_rg_hi = _mm_madd_pi16(vec_rg_hi, mat01_hi);
+        vec_b_lo = _mm_madd_pi16(vec_b_lo, mat2_lo);
+        vec_b_hi = _mm_madd_pi16(vec_b_hi, mat2_hi);
+
+        __m64 vec_r1  = _mm_add_pi32(vec_b_lo, vec_rg_lo);
+        vec_r1  = _mm_add_pi32(vec_r1, half);
+
+        __m64 vec_r2  = _mm_add_pi32(vec_b_hi, vec_rg_hi);
+        vec_r2  = _mm_add_pi32(vec_r2, half);
+
+        vec_r1  = _mm_srai_pi32(vec_r1, ONE_SHIFT+(ONE_SHIFT-12));
+        vec_r2  = _mm_srai_pi32(vec_r2, ONE_SHIFT+(ONE_SHIFT-12));
+        vec_r1 = _mm_max_pi16(min, vec_r1);
+        vec_r2 = _mm_max_pi16(min, vec_r2);
+        result_lo = _mm_min_pi16(max, vec_r1);
+        result_hi = _mm_min_pi16(max, vec_r2);
+
+        /* store the calculated output table indices */
+        *(__m64*)output = result_lo;
+        *(__m64*)(output+2) = result_hi;
+
+        /* load for next loop while store completes */
+        vec_r_hi = vec_r_lo = _mm_set1_pi16(igtbl_r[src[0]]);
+        vec_g_hi = vec_g_lo = _mm_set1_pi16(igtbl_g[src[1]]);
+        vec_b_hi = vec_b_lo = _mm_set1_pi32(igtbl_b[src[2]]);
+        src += 3;
+
+        /* use the calculated indices to look up the output RGB values */
+        dest[0] = otdata_r[output[0]];
+        dest[1] = otdata_g[output[1]];
+        dest[2] = otdata_b[output[2]];
+        dest += 3;
+    }
+
+    /* handle final (maybe only) pixel */
+
+    __m64 vec_rg_lo = _mm_unpacklo_pi16(vec_r_lo, vec_g_lo);
+    __m64 vec_rg_hi = _mm_unpacklo_pi16(vec_r_hi, vec_g_hi);
+
+    /* gamma * matrix */
+    vec_rg_lo = _mm_madd_pi16(vec_rg_lo, mat01_lo);
+    vec_rg_hi = _mm_madd_pi16(vec_rg_hi, mat01_hi);
+    vec_b_lo = _mm_madd_pi16(vec_b_lo, mat2_lo);
+    vec_b_hi = _mm_madd_pi16(vec_b_hi, mat2_hi);
+
+    __m64 vec_r1  = _mm_add_pi32(vec_b_lo, vec_rg_lo);
+    vec_r1  = _mm_add_pi32(vec_r1, half);
+
+    __m64 vec_r2  = _mm_add_pi32(vec_b_hi, vec_rg_hi);
+    vec_r2  = _mm_add_pi32(vec_r2, half);
+
+    vec_r1  = _mm_srai_pi32(vec_r1, ONE_SHIFT+(ONE_SHIFT-12));
+    vec_r2  = _mm_srai_pi32(vec_r2, ONE_SHIFT+(ONE_SHIFT-12));
+    vec_r1 = _mm_max_pi16(min, vec_r1);
+    vec_r2 = _mm_max_pi16(min, vec_r2);
+    result_lo = _mm_min_pi16(max, vec_r1);
+    result_hi = _mm_min_pi16(max, vec_r2);
+
+    /* store the calculated output table indices */
+    *(__m64*)output = result_lo;
+    *(__m64*)(output+2) = result_hi;
+
+    _mm_empty();
+
+    dest[0] = otdata_r[output[0]];
+    dest[1] = otdata_g[output[1]];
+    dest[2] = otdata_b[output[2]];
+}
diff --git a/transform.c b/transform.c
index 025271d..f55e074 100644
--- a/transform.c
+++ b/transform.c
@@ -988,7 +988,6 @@
 
 static int sse_version_available(void)
 {
-	return 0;
 #if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
 	/* we know at build time that 64-bit CPUs always have SSE2
 	 * this tells the compiler that non-SSE2 branches will never be
@@ -1214,7 +1213,7 @@
 #ifdef X86
 		    if (sse_version_available() >= 2) {
 			    if (in_type == QCMS_DATA_RGB_8)
-				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse41_int;
+				    transform->transform_fn = qcms_transform_data_rgb_out_lut_mmx;
 			    else
 				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
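
For reference, a harness along these lines could exercise the new path on an x86
build where sse_version_available() reports >= 2, assuming the rest of the
fixed-point series is applied (a sketch using the existing qcms public API;
profiles, pixel values and the absence of error handling are purely illustrative):

    #include "qcms.h"

    int main(void)
    {
        unsigned char src[6]  = { 255, 128, 0, 10, 20, 30 };   /* two RGB_8 pixels */
        unsigned char dest[6] = { 0 };

        qcms_profile *in  = qcms_profile_sRGB();
        qcms_profile *out = qcms_profile_sRGB();
        qcms_transform *t;

        /* precache the output tables so qcms_transform_create() picks the
         * LUT path that now dispatches to qcms_transform_data_rgb_out_lut_mmx */
        qcms_profile_precache_output_transform(out);

        t = qcms_transform_create(in, QCMS_DATA_RGB_8,
                                  out, QCMS_DATA_RGB_8,
                                  QCMS_INTENT_PERCEPTUAL);

        qcms_transform_data(t, src, dest, 2);   /* length is in pixels */

        qcms_transform_release(t);
        qcms_profile_release(out);
        qcms_profile_release(in);
        return 0;
    }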