Improve SSE2 performance, add SSE support. r=jrmuizel

This patch greatly improves the performance of QCMS transformations on x86 &
x86_64 systems.  Some notes:

0. On 32-bit x86 systems it does runtime selection between non-SIMD, SSE, and
SSE2 code paths.

1. On x86_64 systems the SSE2 code path is always taken.  The non-SIMD and SSE
code paths are left intact, but contemporary versions of the GCC and MSVC
compilers will see that they cannot be reached and optimize them away.

2. The execution time of the SSE2 code path is reduced by 67%, relative to the
original Intel/Microsoft formatted ASM code.  The relative performance is seen
on a Pentium4 (Northwood) 2.4GHz CPU with DDR1 RAM.

3. The SSE code path provides an 80% reduction in execution time, relative to
the non-SIMD code path.  The relative performance is seen on a Pentium3
(Coppermine) 1.26GHz CPU with SDRAM.

4. The code has been split out into separate files so that it can be built
with different cflags (-msse, and -msse2) when using gcc.
diff --git a/Makefile b/Makefile
index b5880cc..042761e 100644
--- a/Makefile
+++ b/Makefile
@@ -6,8 +6,8 @@
 CFLAGS=-Wall $(OPT_FLAGS) $(COVERAGE_FLAGS) -Wdeclaration-after-statement -ggdb `pkg-config --cflags lcms`
 LDFLAGS=`pkg-config --libs lcms` -ldl
 
-QCMS_SRC=iccread.c transform.c
-QCMS_OBJS=iccread.o transform.o
+QCMS_SRC=iccread.c transform.c transform-sse2.c transform-sse1.c
+QCMS_OBJS=iccread.o transform.o transform-sse2.o transform-sse1.o
 
 PROGRAMS=profile-gen test test-invalid lcms-compare dump-profile div-test coverage malloc-fail invalid-coverage
 
diff --git a/qcmsint.h b/qcmsint.h
index 68bbe21..078ffcb 100644
--- a/qcmsint.h
+++ b/qcmsint.h
@@ -141,3 +141,20 @@
 
 void precache_release(struct precache_output *p);
 qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries);
+
+void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
+                                          unsigned char *src,
+                                          unsigned char *dest,
+                                          size_t length);
+void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
+                                          unsigned char *src,
+                                          unsigned char *dest,
+                                          size_t length);
+void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
+                                          unsigned char *src,
+                                          unsigned char *dest,
+                                          size_t length);
+void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
+                                          unsigned char *src,
+                                          unsigned char *dest,
+                                          size_t length);
diff --git a/transform-sse1.c b/transform-sse1.c
new file mode 100644
index 0000000..59affa7
--- /dev/null
+++ b/transform-sse1.c
@@ -0,0 +1,253 @@
+#include <xmmintrin.h>
+
+#include "qcmsint.h"
+
+/* Scale and clamp constants, stored x4 so one _mm_load_ps fills an XMM reg (no load-scalar/shufps sequence) */
+#define FLOATSCALE  65536.0f
+#define CLAMPMAXVAL ( ((float) (65536 - 1)) / 65536.0f )
+static const ALIGN float floatScaleX4[4] =
+    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
+static const ALIGN float clampMaxValueX4[4] =
+    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
+
+void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
+                                          unsigned char *src,
+                                          unsigned char *dest,
+                                          size_t length)
+{
+    /* Converts `length` 3-byte pixels from src to dest. Per pixel: look up each
+     * channel in the input gamma tables, multiply by mat[0..2] and sum, clamp,
+     * scale to a table index, and read the result from the output tables. */
+    size_t i; /* same type as length, so the counter cannot wrap before reaching it */
+    float (*mat)[4] = transform->matrix;
+    char input_back[32];
+    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
+     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
+     * because they don't work on stack variables. gcc 4.4 does do the right thing
+     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
+    uint32_t *output = (uint32_t *)(((uintptr_t)&input_back[16]) & ~(uintptr_t)0xf);
+
+    /* hoist the input gamma table pointers out of the loop */
+    const float *igtbl_r = transform->input_gamma_table_r;
+    const float *igtbl_g = transform->input_gamma_table_g;
+    const float *igtbl_b = transform->input_gamma_table_b;
+
+    /* hoist the output table data pointers out of the loop */
+    const uint8_t *otdata_r = &transform->output_table_r->data[0];
+    const uint8_t *otdata_g = &transform->output_table_g->data[0];
+    const uint8_t *otdata_b = &transform->output_table_b->data[0];
+
+    /* input matrix values never change */
+    const __m128 mat0  = _mm_load_ps(mat[0]);
+    const __m128 mat1  = _mm_load_ps(mat[1]);
+    const __m128 mat2  = _mm_load_ps(mat[2]);
+
+    /* clamp bounds and index scale don't change, either */
+    const __m128 max   = _mm_load_ps(clampMaxValueX4);
+    const __m128 min   = _mm_setzero_ps();
+    const __m128 scale = _mm_load_ps(floatScaleX4);
+
+    /* working variables */
+    __m128 vec_r, vec_g, vec_b, result;
+
+    /* nothing to do; also keeps the length-- below from wrapping around */
+    if (!length)
+        return;
+
+    /* one pixel is handled outside of the loop */
+    length--;
+
+    /* setup for transforming 1st pixel */
+    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
+    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
+    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
+    src += 3;
+
+    /* transform all but final pixel */
+
+    for (i = 0; i < length; i++)
+    {
+        /* broadcast each gamma value to all four lanes */
+        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+
+        /* gamma * matrix */
+        vec_r = _mm_mul_ps(vec_r, mat0);
+        vec_g = _mm_mul_ps(vec_g, mat1);
+        vec_b = _mm_mul_ps(vec_b, mat2);
+
+        /* sum the products, clamp to [0, CLAMPMAXVAL], scale to table indices */
+        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+        vec_r  = _mm_max_ps(min, vec_r);
+        vec_r  = _mm_min_ps(max, vec_r);
+        result = _mm_mul_ps(vec_r, scale);
+
+        /* store calc'd output table indices, two lanes at a time via __m64 */
+        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
+        result = _mm_movehl_ps(result, result);
+        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
+
+        /* load for next loop while store completes */
+        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
+        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
+        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
+        src += 3;
+
+        /* use calc'd indices to output RGB values */
+        dest[0] = otdata_r[output[0]];
+        dest[1] = otdata_g[output[1]];
+        dest[2] = otdata_b[output[2]];
+        dest += 3;
+    }
+
+    /* handle final (maybe only) pixel */
+
+    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+
+    vec_r = _mm_mul_ps(vec_r, mat0);
+    vec_g = _mm_mul_ps(vec_g, mat1);
+    vec_b = _mm_mul_ps(vec_b, mat2);
+
+    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+    vec_r  = _mm_max_ps(min, vec_r);
+    vec_r  = _mm_min_ps(max, vec_r);
+    result = _mm_mul_ps(vec_r, scale);
+
+    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
+    result = _mm_movehl_ps(result, result);
+    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
+
+    dest[0] = otdata_r[output[0]];
+    dest[1] = otdata_g[output[1]];
+    dest[2] = otdata_b[output[2]];
+
+    _mm_empty(); /* clear MMX state after the __m64 conversions above */
+}
+
+void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
+                                           unsigned char *src,
+                                           unsigned char *dest,
+                                           size_t length)
+{
+    /* Converts `length` 4-byte pixels from src to dest. Bytes 0-2 go through the
+     * input gamma tables, mat[0..2], clamp/scale and the output tables; byte 3
+     * of each pixel is copied to dest unchanged. */
+    size_t i; /* same type as length, so the counter cannot wrap before reaching it */
+    float (*mat)[4] = transform->matrix;
+    char input_back[32];
+    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
+     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
+     * because they don't work on stack variables. gcc 4.4 does do the right thing
+     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
+    uint32_t *output = (uint32_t *)(((uintptr_t)&input_back[16]) & ~(uintptr_t)0xf);
+
+    /* hoist the input gamma table pointers out of the loop */
+    const float *igtbl_r = transform->input_gamma_table_r;
+    const float *igtbl_g = transform->input_gamma_table_g;
+    const float *igtbl_b = transform->input_gamma_table_b;
+
+    /* hoist the output table data pointers out of the loop */
+    const uint8_t *otdata_r = &transform->output_table_r->data[0];
+    const uint8_t *otdata_g = &transform->output_table_g->data[0];
+    const uint8_t *otdata_b = &transform->output_table_b->data[0];
+
+    /* input matrix values never change */
+    const __m128 mat0  = _mm_load_ps(mat[0]);
+    const __m128 mat1  = _mm_load_ps(mat[1]);
+    const __m128 mat2  = _mm_load_ps(mat[2]);
+
+    /* clamp bounds and index scale don't change, either */
+    const __m128 max   = _mm_load_ps(clampMaxValueX4);
+    const __m128 min   = _mm_setzero_ps();
+    const __m128 scale = _mm_load_ps(floatScaleX4);
+
+    /* working variables */
+    __m128 vec_r, vec_g, vec_b, result;
+    unsigned char alpha;
+
+    /* nothing to do; also keeps the length-- below from wrapping around */
+    if (!length)
+        return;
+
+    /* one pixel is handled outside of the loop */
+    length--;
+
+    /* setup for transforming 1st pixel */
+    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
+    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
+    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
+    alpha = src[3];
+    src += 4;
+
+    /* transform all but final pixel */
+
+    for (i = 0; i < length; i++)
+    {
+        /* broadcast each gamma value to all four lanes */
+        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+
+        /* gamma * matrix */
+        vec_r = _mm_mul_ps(vec_r, mat0);
+        vec_g = _mm_mul_ps(vec_g, mat1);
+        vec_b = _mm_mul_ps(vec_b, mat2);
+
+        /* store alpha for this pixel; load alpha for next (src already advanced) */
+        dest[3] = alpha;
+        alpha   = src[3];
+
+        /* sum the products, clamp to [0, CLAMPMAXVAL], scale to table indices */
+        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+        vec_r  = _mm_max_ps(min, vec_r);
+        vec_r  = _mm_min_ps(max, vec_r);
+        result = _mm_mul_ps(vec_r, scale);
+
+        /* store calc'd output table indices, two lanes at a time via __m64 */
+        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
+        result = _mm_movehl_ps(result, result);
+        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
+
+        /* load gamma values for next loop while store completes */
+        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
+        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
+        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
+        src += 4;
+
+        /* use calc'd indices to output RGB values */
+        dest[0] = otdata_r[output[0]];
+        dest[1] = otdata_g[output[1]];
+        dest[2] = otdata_b[output[2]];
+        dest += 4;
+    }
+
+    /* handle final (maybe only) pixel */
+
+    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+
+    vec_r = _mm_mul_ps(vec_r, mat0);
+    vec_g = _mm_mul_ps(vec_g, mat1);
+    vec_b = _mm_mul_ps(vec_b, mat2);
+
+    dest[3] = alpha;
+
+    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+    vec_r  = _mm_max_ps(min, vec_r);
+    vec_r  = _mm_min_ps(max, vec_r);
+    result = _mm_mul_ps(vec_r, scale);
+
+    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
+    result = _mm_movehl_ps(result, result);
+    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
+
+    dest[0] = otdata_r[output[0]];
+    dest[1] = otdata_g[output[1]];
+    dest[2] = otdata_b[output[2]];
+
+    _mm_empty(); /* clear MMX state after the __m64 conversions above */
+}
diff --git a/transform-sse2.c b/transform-sse2.c
new file mode 100644
index 0000000..208b4d9
--- /dev/null
+++ b/transform-sse2.c
@@ -0,0 +1,243 @@
+#include <emmintrin.h>
+
+#include "qcmsint.h"
+
+/* Scale and clamp constants, stored x4 so one _mm_load_ps fills an XMM reg (no load-scalar/shufps sequence) */
+#define FLOATSCALE  65536.0f
+#define CLAMPMAXVAL ( ((float) (65536 - 1)) / 65536.0f )
+static const ALIGN float floatScaleX4[4] =
+    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
+static const ALIGN float clampMaxValueX4[4] =
+    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
+
+void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
+                                          unsigned char *src,
+                                          unsigned char *dest,
+                                          size_t length)
+{
+    /* Converts `length` 3-byte pixels from src to dest. Per pixel: look up each
+     * channel in the input gamma tables, multiply by mat[0..2] and sum, clamp,
+     * scale to a table index, and read the result from the output tables. */
+    size_t i; /* same type as length, so the counter cannot wrap before reaching it */
+    float (*mat)[4] = transform->matrix;
+    char input_back[32];
+    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
+     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
+     * because they don't work on stack variables. gcc 4.4 does do the right thing
+     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
+    uint32_t *output = (uint32_t *)(((uintptr_t)&input_back[16]) & ~(uintptr_t)0xf);
+
+    /* hoist the input gamma table pointers out of the loop */
+    const float *igtbl_r = transform->input_gamma_table_r;
+    const float *igtbl_g = transform->input_gamma_table_g;
+    const float *igtbl_b = transform->input_gamma_table_b;
+
+    /* hoist the output table data pointers out of the loop */
+    const uint8_t *otdata_r = &transform->output_table_r->data[0];
+    const uint8_t *otdata_g = &transform->output_table_g->data[0];
+    const uint8_t *otdata_b = &transform->output_table_b->data[0];
+
+    /* input matrix values never change */
+    const __m128 mat0  = _mm_load_ps(mat[0]);
+    const __m128 mat1  = _mm_load_ps(mat[1]);
+    const __m128 mat2  = _mm_load_ps(mat[2]);
+
+    /* clamp bounds and index scale don't change, either */
+    const __m128 max   = _mm_load_ps(clampMaxValueX4);
+    const __m128 min   = _mm_setzero_ps();
+    const __m128 scale = _mm_load_ps(floatScaleX4);
+
+    /* working variables */
+    __m128 vec_r, vec_g, vec_b, result;
+
+    /* nothing to do; also keeps the length-- below from wrapping around */
+    if (!length)
+        return;
+
+    /* one pixel is handled outside of the loop */
+    length--;
+
+    /* setup for transforming 1st pixel */
+    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
+    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
+    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
+    src += 3;
+
+    /* transform all but final pixel */
+
+    for (i = 0; i < length; i++)
+    {
+        /* broadcast each gamma value to all four lanes */
+        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+
+        /* gamma * matrix */
+        vec_r = _mm_mul_ps(vec_r, mat0);
+        vec_g = _mm_mul_ps(vec_g, mat1);
+        vec_b = _mm_mul_ps(vec_b, mat2);
+
+        /* sum the products, clamp to [0, CLAMPMAXVAL], scale to table indices */
+        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+        vec_r  = _mm_max_ps(min, vec_r);
+        vec_r  = _mm_min_ps(max, vec_r);
+        result = _mm_mul_ps(vec_r, scale);
+
+        /* convert all four lanes to integers and store the table indices */
+        _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));
+
+        /* load for next loop while store completes */
+        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
+        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
+        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
+        src += 3;
+
+        /* use calc'd indices to output RGB values */
+        dest[0] = otdata_r[output[0]];
+        dest[1] = otdata_g[output[1]];
+        dest[2] = otdata_b[output[2]];
+        dest += 3;
+    }
+
+    /* handle final (maybe only) pixel */
+
+    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+
+    vec_r = _mm_mul_ps(vec_r, mat0);
+    vec_g = _mm_mul_ps(vec_g, mat1);
+    vec_b = _mm_mul_ps(vec_b, mat2);
+
+    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+    vec_r  = _mm_max_ps(min, vec_r);
+    vec_r  = _mm_min_ps(max, vec_r);
+    result = _mm_mul_ps(vec_r, scale);
+
+    _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));
+
+    dest[0] = otdata_r[output[0]];
+    dest[1] = otdata_g[output[1]];
+    dest[2] = otdata_b[output[2]];
+}
+
+void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
+                                           unsigned char *src,
+                                           unsigned char *dest,
+                                           size_t length)
+{
+    /* Converts `length` 4-byte pixels from src to dest. Bytes 0-2 go through the
+     * input gamma tables, mat[0..2], clamp/scale and the output tables; byte 3
+     * of each pixel is copied to dest unchanged. */
+    size_t i; /* same type as length, so the counter cannot wrap before reaching it */
+    float (*mat)[4] = transform->matrix;
+    char input_back[32];
+    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
+     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
+     * because they don't work on stack variables. gcc 4.4 does do the right thing
+     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
+    uint32_t *output = (uint32_t *)(((uintptr_t)&input_back[16]) & ~(uintptr_t)0xf);
+
+    /* hoist the input gamma table pointers out of the loop */
+    const float *igtbl_r = transform->input_gamma_table_r;
+    const float *igtbl_g = transform->input_gamma_table_g;
+    const float *igtbl_b = transform->input_gamma_table_b;
+
+    /* hoist the output table data pointers out of the loop */
+    const uint8_t *otdata_r = &transform->output_table_r->data[0];
+    const uint8_t *otdata_g = &transform->output_table_g->data[0];
+    const uint8_t *otdata_b = &transform->output_table_b->data[0];
+
+    /* input matrix values never change */
+    const __m128 mat0  = _mm_load_ps(mat[0]);
+    const __m128 mat1  = _mm_load_ps(mat[1]);
+    const __m128 mat2  = _mm_load_ps(mat[2]);
+
+    /* clamp bounds and index scale don't change, either */
+    const __m128 max   = _mm_load_ps(clampMaxValueX4);
+    const __m128 min   = _mm_setzero_ps();
+    const __m128 scale = _mm_load_ps(floatScaleX4);
+
+    /* working variables */
+    __m128 vec_r, vec_g, vec_b, result;
+    unsigned char alpha;
+
+    /* nothing to do; also keeps the length-- below from wrapping around */
+    if (!length)
+        return;
+
+    /* one pixel is handled outside of the loop */
+    length--;
+
+    /* setup for transforming 1st pixel */
+    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
+    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
+    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
+    alpha = src[3];
+    src += 4;
+
+    /* transform all but final pixel */
+
+    for (i = 0; i < length; i++)
+    {
+        /* broadcast each gamma value to all four lanes */
+        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+
+        /* gamma * matrix */
+        vec_r = _mm_mul_ps(vec_r, mat0);
+        vec_g = _mm_mul_ps(vec_g, mat1);
+        vec_b = _mm_mul_ps(vec_b, mat2);
+
+        /* store alpha for this pixel; load alpha for next (src already advanced) */
+        dest[3] = alpha;
+        alpha   = src[3];
+
+        /* sum the products, clamp to [0, CLAMPMAXVAL], scale to table indices */
+        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+        vec_r  = _mm_max_ps(min, vec_r);
+        vec_r  = _mm_min_ps(max, vec_r);
+        result = _mm_mul_ps(vec_r, scale);
+
+        /* convert all four lanes to integers and store the table indices */
+        _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));
+
+        /* load gamma values for next loop while store completes */
+        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
+        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
+        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
+        src += 4;
+
+        /* use calc'd indices to output RGB values */
+        dest[0] = otdata_r[output[0]];
+        dest[1] = otdata_g[output[1]];
+        dest[2] = otdata_b[output[2]];
+        dest += 4;
+    }
+
+    /* handle final (maybe only) pixel */
+
+    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
+    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
+    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
+
+    vec_r = _mm_mul_ps(vec_r, mat0);
+    vec_g = _mm_mul_ps(vec_g, mat1);
+    vec_b = _mm_mul_ps(vec_b, mat2);
+
+    dest[3] = alpha;
+
+    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
+    vec_r  = _mm_max_ps(min, vec_r);
+    vec_r  = _mm_min_ps(max, vec_r);
+    result = _mm_mul_ps(vec_r, scale);
+
+    _mm_store_si128((__m128i *)output, _mm_cvtps_epi32(result));
+
+    dest[0] = otdata_r[output[0]];
+    dest[1] = otdata_g[output[1]];
+    dest[2] = otdata_b[output[2]];
+}
+
+
diff --git a/transform.c b/transform.c
index 727622c..d9fd089 100644
--- a/transform.c
+++ b/transform.c
@@ -25,9 +25,10 @@
 #include <assert.h>
 #include "qcmsint.h"
 
-#if defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64)
+/* for MSVC, GCC, and Intel compilers */
+#if defined(_M_IX86) || defined(__i386__) || defined(_M_AMD64) || defined(__x86_64__)
 #define X86
-#endif
+#endif /* _M_IX86 || __i386__ || _M_AMD64 || __x86_64__ */
 
 //XXX: could use a bettername
 typedef uint16_t uint16_fract_t;
@@ -734,352 +735,6 @@
 	}
 }
 
-static const ALIGN float floatScale = 65536.0f;
-static const ALIGN float * const floatScaleAddr = &floatScale; // Win32 ASM doesn't know how to take addressOf inline
-
-static const ALIGN float clampMaxValue = ((float) (65536 - 1)) / 65536.0f;
-
-#ifdef X86
-#if 0
-#include <emmintrin.h>
-void qcms_transform_data_rgb_out_lut_sse_intrin(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
-{
-	int i;
-	float (*mat)[4] = transform->matrix;
-        char input_back[32];
-	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
-	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
-	 * because they don't work on stack variables. gcc 4.4 does do the right thing 
-	 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
-        float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
-        /* share input and output locations to save having to keep the
-         * locations in separate registers */
-        uint32_t* output = (uint32_t*)input;
-	for (i=0; i<length; i++) {
-		const float *clampMax = &clampMaxValue;
-
-		unsigned char device_r = *src++;
-		unsigned char device_g = *src++;
-		unsigned char device_b = *src++;
-
-		__m128 xmm1 = _mm_load_ps(mat[0]);
-		__m128 xmm2 = _mm_load_ps(mat[1]);
-		__m128 xmm3 = _mm_load_ps(mat[2]);
-
-		__m128 vec_r = _mm_load_ss(&transform->input_gamma_table_r[device_r]);
-		vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-		__m128 vec_g = _mm_load_ss(&transform->input_gamma_table_r[device_g]);
-		vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-		__m128 vec_b = _mm_load_ss(&transform->input_gamma_table_r[device_b]);
-		vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-		vec_r = _mm_mul_ps(vec_r, xmm1);
-		vec_g = _mm_mul_ps(vec_g, xmm2);
-		vec_b = _mm_mul_ps(vec_b, xmm3);
-
-		vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-
-		__m128 max = _mm_load_ss(&clampMax);
-		max = _mm_shuffle_ps(max, max, 0);
-		__m128 min = _mm_setzero_ps();
-
-		vec_r = _mm_max_ps(min, vec_r);
-		vec_r = _mm_min_ps(max, vec_r);
-
-		__m128 scale = _mm_load_ss(&floatScale);
-		scale = _mm_shuffle_ps(scale, scale, 0);
-		__m128 result = _mm_mul_ps(vec_r, scale);
-
-		__m128i out = _mm_cvtps_epi32(result);
-		_mm_store_si128((__m128i*)input, out);
-
-		*dest++ = transform->output_table_r->data[output[0]];
-		*dest++ = transform->output_table_g->data[output[1]];
-		*dest++ = transform->output_table_b->data[output[2]];
-	}
-}
-#endif
-
-#if defined(_MSC_VER) && defined(_M_AMD64)
-#include <emmintrin.h>
-#endif
-
-static void qcms_transform_data_rgb_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
-{
-	unsigned int i;
-	float (*mat)[4] = transform->matrix;
-        char input_back[32];
-	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
-	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
-	 * because they don't work on stack variables. gcc 4.4 does do the right thing 
-	 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
-        float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
-        /* share input and output locations to save having to keep the
-         * locations in separate registers */
-        uint32_t* output = (uint32_t*)input;
-	for (i = 0; i < length; i++) {
-		const float *clampMax = &clampMaxValue;
-
-		unsigned char device_r = *src++;
-		unsigned char device_g = *src++;
-		unsigned char device_b = *src++;
-
-		input[0] = transform->input_gamma_table_r[device_r];
-		input[1] = transform->input_gamma_table_g[device_g];
-		input[2] = transform->input_gamma_table_b[device_b];
-
-#ifdef __GNUC__
-		__asm(
-                      "movaps (%0), %%xmm1;\n\t"          // Move the first matrix column to xmm1
-                      "movaps 16(%0), %%xmm2;\n\t"        // Move the second matrix column to xmm2
-                      "movaps 32(%0), %%xmm3;\n\t"        // move the third matrix column to xmm3
-                      "movaps (%3), %%xmm0;\n\t"        // Move the vector to xmm0
-
-                                                          // Note - We have to copy and then shuffle because of the weird
-                                                          // semantics of shufps
-                                                          //
-                      "movaps %%xmm0, %%xmm4;\n\t"        // Copy the vector to xmm4
-                      "shufps $0, %%xmm4, %%xmm4;\n\t"    // Shuffle to repeat the first vector element repeated 4 times
-                      "mulps %%xmm4, %%xmm1;\n\t"         // Multiply the first vector element by the first matrix column
-                      "movaps %%xmm0, %%xmm5; \n\t"       // Copy the vector to xmm5
-                      "shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
-                      "mulps %%xmm5, %%xmm2;\n\t"         // Multiply the second vector element by the seccond matrix column 
-                      "movaps %%xmm0, %%xmm6;\n\t"        // Copy the vector to xmm6
-                      "shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
-                      "mulps %%xmm6, %%xmm3;\n\t"         // Multiply the third vector element by the third matrix column
-
-                      "addps %%xmm3, %%xmm2;\n\t"         // Sum (second + third) columns
-                      "addps %%xmm2, %%xmm1;\n\t"         // Sum ((second + third) + first) columns
-
-                      "movss (%1), %%xmm7;\n\t"        // load the floating point representation of 65535/65536 
-                      "shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
-                      "minps %%xmm7, %%xmm1;\n\t"      // clamp the vector to 1.0 max
-                      "xorps %%xmm6, %%xmm6;\n\t"       // get us cleared bitpatern, which is 0.0f
-                      "maxps %%xmm6, %%xmm1;\n\t"      // clamp the vector to 0.0 min
-                      "movss (%2), %%xmm5;\n\t"        // load the floating point scale factor
-                      "shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
-                      "mulps %%xmm5, %%xmm1;\n\t"      // multiply by the scale factor
-                      "cvtps2dq %%xmm1, %%xmm1;\n\t"   // convert to integers
-                      "movdqa %%xmm1, (%3);\n\t"       // store
-
-                      : 
-                      : "r" (mat), "r" (clampMax), "r" (&floatScale), "r" (input)
-                      : "memory"
-/* older versions of gcc don't know about these registers so only include them as constraints
-   if gcc knows about them */
-#ifdef __SSE2__
-                        , "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
-#endif
-                      );
-#elif defined(_MSC_VER) && defined(_M_IX86)
-                __asm {
-                      mov      eax, mat
-                      mov      ecx, clampMax
-                      mov      edx, floatScaleAddr
-		      mov      ebx, input
-
-                      movaps   xmm1, [eax]
-                      movaps   xmm2, [eax + 16]
-                      movaps   xmm3, [eax + 32]
-                      movaps   xmm0, [ebx]
-
-                      movaps   xmm4, xmm0
-                      shufps   xmm4, xmm4, 0
-                      mulps    xmm1, xmm4
-                      movaps   xmm5, xmm0
-                      shufps   xmm5, xmm5, 0x55
-                      mulps    xmm2, xmm5
-                      movaps   xmm6, xmm0
-                      shufps   xmm6, xmm6, 0xAA
-                      mulps    xmm3, xmm6
-
-                      addps    xmm2, xmm3
-                      addps    xmm1, xmm2
-
-                      movss    xmm7, [ecx]
-                      shufps   xmm7, xmm7, 0
-                      minps    xmm1, xmm7
-                      xorps    xmm6, xmm6
-                      maxps    xmm1, xmm6
-                      movss    xmm5, [edx]
-                      shufps   xmm5, xmm5, 0
-                      mulps    xmm1, xmm5
-                      cvtps2dq xmm1, xmm1
-                      movdqa   [ebx], xmm1
-                }
-#elif defined(_MSC_VER) && defined(_M_AMD64)
-                {
-                        __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7;
-
-                        xmm1 = _mm_load_ps((__m128*)mat);
-                        xmm2 = _mm_load_ps(((__m128*)mat) + 1);
-                        xmm3 = _mm_load_ps(((__m128*)mat) + 2);
-                        xmm0 = _mm_load_ps((__m128*)input);
-
-                        xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0)));
-                        xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1)));
-                        xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2)));
-
-                        xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3));
-
-                        xmm7 = _mm_load_ss(clampMax);
-                        xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0));
-                        xmm1 = _mm_min_ps(xmm1, xmm7);
-                        xmm6 = _mm_xor_ps(xmm6, xmm6);
-                        xmm1 = _mm_max_ps(xmm1, xmm6);
-                        xmm5 = _mm_load_ss(&floatScale);
-                        xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0));
-                        xmm1 = _mm_mul_ps(xmm1, xmm5);
-                        _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1));
-                }
-#else
-#error "Unknown platform"
-#endif
-
-		*dest++ = transform->output_table_r->data[output[0]];
-		*dest++ = transform->output_table_g->data[output[1]];
-		*dest++ = transform->output_table_b->data[output[2]];
-	}
-}
-
-static void qcms_transform_data_rgba_out_lut_sse(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
-{
-	unsigned int i;
-	float (*mat)[4] = transform->matrix;
-        char input_back[32];
-	/* align input on 16 byte boundary */
-        float *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
-        /* share input and output locations to save having to keep the
-         * locations in separate registers */
-        uint32_t* output = (uint32_t*)input;
-	for (i = 0; i < length; i++) {
-		const float *clampMax = &clampMaxValue;
-
-		unsigned char device_r = *src++;
-		unsigned char device_g = *src++;
-		unsigned char device_b = *src++;
-		unsigned char alpha = *src++;
-
-		input[0] = transform->input_gamma_table_r[device_r];
-		input[1] = transform->input_gamma_table_g[device_g];
-		input[2] = transform->input_gamma_table_b[device_b];
-
-#ifdef __GNUC__
-		__asm(
-                      "movaps (%0), %%xmm1;\n\t"          // Move the first matrix column to xmm1
-                      "movaps 16(%0), %%xmm2;\n\t"        // Move the second matrix column to xmm2
-                      "movaps 32(%0), %%xmm3;\n\t"        // move the third matrix column to xmm3
-                      "movaps (%3), %%xmm0;\n\t"        // Move the vector to xmm0
-
-                                                          // Note - We have to copy and then shuffle because of the weird
-                                                          // semantics of shufps
-                                                          //
-                      "movaps %%xmm0, %%xmm4;\n\t"        // Copy the vector to xmm4
-                      "shufps $0, %%xmm4, %%xmm4;\n\t"    // Shuffle to repeat the first vector element repeated 4 times
-                      "mulps %%xmm4, %%xmm1;\n\t"         // Multiply the first vector element by the first matrix column
-                      "movaps %%xmm0, %%xmm5; \n\t"       // Copy the vector to xmm5
-                      "shufps $0x55, %%xmm5, %%xmm5;\n\t" // Shuffle to repeat the second vector element repeated 4 times
-                      "mulps %%xmm5, %%xmm2;\n\t"         // Multiply the second vector element by the seccond matrix column 
-                      "movaps %%xmm0, %%xmm6;\n\t"        // Copy the vector to xmm6
-                      "shufps $0xAA, %%xmm6, %%xmm6;\n\t" // Shuffle to repeat the third vector element repeated 4 times
-                      "mulps %%xmm6, %%xmm3;\n\t"         // Multiply the third vector element by the third matrix column
-
-                      "addps %%xmm3, %%xmm2;\n\t"         // Sum (second + third) columns
-                      "addps %%xmm2, %%xmm1;\n\t"         // Sum ((second + third) + first) columns
-
-                      "movss (%1), %%xmm7;\n\t"        // load the floating point representation of 65535/65536 
-                      "shufps $0, %%xmm7, %%xmm7;\n\t" // move it into all of the four slots
-                      "minps %%xmm7, %%xmm1;\n\t"      // clamp the vector to 1.0 max
-                      "xorps %%xmm6, %%xmm6;\n\t"       // get us cleared bitpatern, which is 0.0f
-                      "maxps %%xmm6, %%xmm1;\n\t"      // clamp the vector to 0.0 min
-                      "movss (%2), %%xmm5;\n\t"        // load the floating point scale factor
-                      "shufps $0, %%xmm5, %%xmm5;\n\t" // put it in all four slots
-                      "mulps %%xmm5, %%xmm1;\n\t"      // multiply by the scale factor
-                      "cvtps2dq %%xmm1, %%xmm1;\n\t"   // convert to integers
-                      "movdqa %%xmm1, (%3);\n\t"       // store
-
-                      : 
-                      : "r" (mat), "r" (clampMax), "r" (&floatScale), "r" (input)
-                      : "memory"
-/* older versions of gcc don't know about these registers so only include them as constraints
-   if gcc knows about them */
-#ifdef __SSE2__
-                        , "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
-#endif
-                      );
-#elif defined(_MSC_VER) && defined(_M_IX86)
-                __asm {
-                      mov      eax, mat
-                      mov      ecx, clampMax
-                      mov      edx, floatScaleAddr
-		      mov      ebx, input
-
-                      movaps   xmm1, [eax]
-                      movaps   xmm2, [eax + 16]
-                      movaps   xmm3, [eax + 32]
-                      movaps   xmm0, [ebx]
-
-                      movaps   xmm4, xmm0
-                      shufps   xmm4, xmm4, 0
-                      mulps    xmm1, xmm4
-                      movaps   xmm5, xmm0
-                      shufps   xmm5, xmm5, 0x55
-                      mulps    xmm2, xmm5
-                      movaps   xmm6, xmm0
-                      shufps   xmm6, xmm6, 0xAA
-                      mulps    xmm3, xmm6
-
-                      addps    xmm2, xmm3
-                      addps    xmm1, xmm2
-
-                      movss    xmm7, [ecx]
-                      shufps   xmm7, xmm7, 0
-                      minps    xmm1, xmm7
-                      xorps    xmm6, xmm6
-                      maxps    xmm1, xmm6
-                      movss    xmm5, [edx]
-                      shufps   xmm5, xmm5, 0
-                      mulps    xmm1, xmm5
-                      cvtps2dq xmm1, xmm1
-                      movdqa   [ebx], xmm1
-                }
-#elif defined(_MSC_VER) && defined(_M_AMD64)
-                {
-                        __m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7;
-
-                        xmm1 = _mm_load_ps((__m128*)mat);
-                        xmm2 = _mm_load_ps(((__m128*)mat) + 1);
-                        xmm3 = _mm_load_ps(((__m128*)mat) + 2);
-                        xmm0 = _mm_load_ps((__m128*)input);
-
-                        xmm1 = _mm_mul_ps(xmm1, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0,0,0,0)));
-                        xmm2 = _mm_mul_ps(xmm2, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1,1,1,1)));
-                        xmm3 = _mm_mul_ps(xmm3, _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2,2,2,2)));
-
-                        xmm1 = _mm_add_ps(xmm1, _mm_add_ps(xmm2, xmm3));
-
-                        xmm7 = _mm_load_ss(clampMax);
-                        xmm7 = _mm_shuffle_ps(xmm7, xmm7, _MM_SHUFFLE(0,0,0,0));
-                        xmm1 = _mm_min_ps(xmm1, xmm7);
-                        xmm6 = _mm_xor_ps(xmm6, xmm6);
-                        xmm1 = _mm_max_ps(xmm1, xmm6);
-                        xmm5 = _mm_load_ss(&floatScale);
-                        xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(0,0,0,0));
-                        xmm1 = _mm_mul_ps(xmm1, xmm5);
-                        _mm_store_si128((__m128i*)input, _mm_cvtps_epi32(xmm1));
-                }
-#else
-#error "Unknown platform"
-#endif
-
-		*dest++ = transform->output_table_r->data[output[0]];
-		*dest++ = transform->output_table_g->data[output[1]];
-		*dest++ = transform->output_table_b->data[output[2]];
-		*dest++ = alpha;
-	}
-}
-#endif
-
 static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 {
 	unsigned int i;
@@ -1380,7 +1035,7 @@
 	return true;
 }
 
-
+#ifdef X86
 // Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
 // mozilla/jpeg)
  // -------------------------------------------------------------------------
@@ -1423,31 +1078,43 @@
 }
 #endif
 
-// -------------------------Runtime SSE2 Detection-----------------------------
+// -------------------------Runtime SSEx Detection-----------------------------
 
+/* MMX is always supported per
+ *  Gecko v1.9.1 minimum CPU requirements */
+#define SSE1_EDX_MASK (1UL << 25)
 #define SSE2_EDX_MASK (1UL << 26)
-static qcms_bool sse2_available(void)
+#define SSE3_ECX_MASK (1UL <<  0)
+
+static int sse_version_available(void)
 {
 #if defined(__x86_64__) || defined(_M_AMD64)
-       return true;
+	/* We know at build time that 64-bit CPUs always have SSE2;
+	 * this tells the compiler that non-SSE2 branches will never be
+	 * taken (i.e. OK to optimize away the SSE1 and non-SIMD code). */
+	return 2;
 #elif defined(HAS_CPUID)
-       static int has_sse2 = -1;
-       uint32_t a, b, c, d;
-       uint32_t function = 0x00000001;
+	static int sse_version = -1;
+	uint32_t a, b, c, d;
+	uint32_t function = 0x00000001;
 
-       if (has_sse2 == -1) {
-              has_sse2 = 0;
-	      cpuid(function, &a, &b, &c, &d);
-              if (d & SSE2_EDX_MASK)
-                     has_sse2 = 1;
-              else
-                     has_sse2 = 0;
-       }
+	if (sse_version == -1) {
+		sse_version = 0;
+		cpuid(function, &a, &b, &c, &d);
+		if (c & SSE3_ECX_MASK)
+			sse_version = 3;
+		else if (d & SSE2_EDX_MASK)
+			sse_version = 2;
+		else if (d & SSE1_EDX_MASK)
+			sse_version = 1;
+	}
 
-       return has_sse2;
+	return sse_version;
+#else
+	return 0;
 #endif
-       return false;
 }
+#endif
 
 void build_output_lut(struct curveType *trc,
 		uint16_t **output_gamma_lut, size_t *output_gamma_lut_length)
@@ -1553,11 +1220,18 @@
             }
 	    if (precache) {
 #ifdef X86
-		    if (sse2_available()) {
+		    if (sse_version_available() >= 2) {
 			    if (in_type == QCMS_DATA_RGB_8)
-				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse;
+				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
 			    else
-				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse;
+				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
+
+		    } else
+		    if (sse_version_available() >= 1) {
+			    if (in_type == QCMS_DATA_RGB_8)
+				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
+			    else
+				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
 
 		    } else
 #endif
@@ -1639,6 +1313,10 @@
 	return transform;
 }
 
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+/* we need this to avoid crashes when gcc assumes the stack is 128-bit aligned */
+__attribute__((__force_align_arg_pointer__))
+#endif
 void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
 {
 	transform->transform_fn(transform, src, dest, length);