performance
diff --git a/Makefile b/Makefile
index 2aecdfb..c8d914f 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@
 QCMS_SRC=iccread.c transform.c matrix.c chain.c transform-util.c transform-sse2.c transform-sse1.c
 QCMS_OBJS=iccread.o transform.o matrix.o chain.o transform-util.o transform-sse2.o transform-sse1.o
 
-PROGRAMS=profile-gen test test-invalid test-transform dump-profile div-test coverage malloc-fail invalid-coverage
+PROGRAMS=performance profile-gen test test-invalid test-transform dump-profile div-test coverage malloc-fail invalid-coverage
 
 # I don't know a good way to get the exit code of pkg-config into a make variable
 HAS_LCMS:=$(shell pkg-config --exists lcms; echo $$?)
diff --git a/performance.c b/performance.c
new file mode 100644
index 0000000..fb6591b
--- /dev/null
+++ b/performance.c
@@ -0,0 +1,107 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include "sum.h"
+#include "qcms.h"
+
+/*
+ * Benchmark for a qcms RGB8 -> RGB8 transform between two ICC profiles.
+ *
+ * Usage: performance <input-profile> <output-profile>
+ *
+ * Prints the clock() ticks spent loading both profiles, creating the
+ * transform and converting LENGTH pixels, followed by difference
+ * statistics between the qoutput and loutput buffers.
+ *
+ * Returns 0 on success, 1 on bad usage or when a profile or the
+ * transform could not be created.
+ */
+int main(int argc, char **argv)
+{
+	qcms_profile *input_profile = NULL, *output_profile = NULL;
+	qcms_transform *transform = NULL;
+	clock_t qcms_start, qcms_time;
+	/* 64-bit: with ALL defined the sums can reach 3*255*2^24, past INT_MAX */
+	long long total_diff = 0;
+	long long diff_sum = 0;
+	int rc = 1;
+	int i;
+/* ALL converts every 8-bit RGB triple; without it a single pixel is used */
+#define ALL
+#ifndef ALL
+#define LENGTH 1
+#else
+#define LENGTH (256*256*256)
+#endif
+	/* static storage: each buffer is 48 MiB when ALL is defined */
+	static unsigned char src[LENGTH*3];
+	static unsigned char qoutput[LENGTH*3];
+	/* Never written in this file, so it stays zero-filled; presumably a
+	 * slot for reference output from another CMS -- TODO confirm. */
+	static unsigned char loutput[LENGTH*3];
+
+	if (argc < 3) {
+		fprintf(stderr, "usage: %s <input-profile> <output-profile>\n",
+			argc > 0 ? argv[0] : "performance");
+		return 1;
+	}
+#ifdef ALL
+	int j, k, l = 0;
+	for (i=0; i<256; i++) {
+		for (j=0; j<256; j++) {
+			for (k=0; k<256; k++) {
+				src[l++] = i;
+				src[l++] = j;
+				src[l++] = k;
+			}
+		}
+	}
+#else
+	src[0] = 19;
+	src[1] = 28;
+	src[2] = 56;
+#endif
+	/* the timed region covers profile loading and transform setup too */
+	qcms_start = clock();
+	input_profile = qcms_profile_from_path(argv[1]);
+	output_profile = qcms_profile_from_path(argv[2]);
+	if (!input_profile || !output_profile) {
+		fprintf(stderr, "could not load the input and/or output profile\n");
+		goto out;
+	}
+	qcms_profile_precache_output_transform(output_profile);
+
+	transform = qcms_transform_create(input_profile, QCMS_DATA_RGB_8, output_profile, QCMS_DATA_RGB_8, QCMS_INTENT_PERCEPTUAL);
+	if (!transform) {
+		fprintf(stderr, "could not create the transform\n");
+		goto out;
+	}
+	qcms_transform_data(transform, src, qoutput, LENGTH);
+	qcms_time = clock() - qcms_start;
+	/* clock_t's exact type is implementation-defined; cast to match %ld */
+	printf("qcms: %ld\n", (long)qcms_time);
+
+	for (i=0; i<LENGTH; i++) {
+		int diff = 0;
+		diff_sum += (loutput[i*3]-qoutput[i*3]);
+		diff_sum += (loutput[i*3+1]-qoutput[i*3+1]);
+		diff_sum += (loutput[i*3+2]-qoutput[i*3+2]);
+		diff += abs(loutput[i*3]-qoutput[i*3]);
+		diff += abs(loutput[i*3+1]-qoutput[i*3+1]);
+		diff += abs(loutput[i*3+2]-qoutput[i*3+2]);
+		total_diff += diff;
+	}
+	printf("%lld %lld - %f\n", diff_sum, total_diff, (double)total_diff/LENGTH);
+	rc = 0;
+out:
+	if (transform) {
+		qcms_transform_release(transform);
+	}
+	if (input_profile) {
+		qcms_profile_release(input_profile);
+	}
+	if (output_profile) {
+		qcms_profile_release(output_profile);
+	}
+	return rc;
+}
diff --git a/transform.c b/transform.c
index f55e074..40a7491 100644
--- a/transform.c
+++ b/transform.c
@@ -988,6 +988,7 @@
 
 static int sse_version_available(void)
 {
+	return 0;
 #if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
 	/* we know at build time that 64-bit CPUs always have SSE2
 	 * this tells the compiler that non-SSE2 branches will never be