Reduce precision of output tables
diff --git a/qcmsint.h b/qcmsint.h
index b2e7bf6..217b651 100644
--- a/qcmsint.h
+++ b/qcmsint.h
@@ -1,13 +1,20 @@
 #include "qcms.h"
 #include "qcmstypes.h"
 
-/* used as a 16bit lookup table for the output transformation.
+/* used as a lookup table for the output transformation.
  * we refcount them so we only need to have one around per output
  * profile, instead of duplicating them per transform */
 struct precache_output
 {
 	int ref_count;
-	uint8_t data[65535];
+	/* We previously used a count of 65536 here but that seems like more
+	 * precision than we actually need.  By reducing the size we can
+	 * improve startup performance and reduce memory usage. ColorSync on
+	 * 10.5 uses 4097 which is perhaps because they use a fixed point
+	 * representation where 1. is represented by 0x1000. */
+#define PRECACHE_OUTPUT_SIZE 8192
+#define PRECACHE_OUTPUT_MAX (PRECACHE_OUTPUT_SIZE-1)
+	uint8_t data[PRECACHE_OUTPUT_SIZE];
 };
 
 #ifdef _MSC_VER
diff --git a/transform-sse1.c b/transform-sse1.c
index 59affa7..00707ca 100644
--- a/transform-sse1.c
+++ b/transform-sse1.c
@@ -3,8 +3,8 @@
 #include "qcmsint.h"
 
 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
-#define FLOATSCALE  65536.0f
-#define CLAMPMAXVAL ( ((float) (65536 - 1)) / 65536.0f )
+#define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
+#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
 static const ALIGN float floatScaleX4[4] =
     { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
 static const ALIGN float clampMaxValueX4[4] =
diff --git a/transform-sse2.c b/transform-sse2.c
index 208b4d9..f6e1674 100644
--- a/transform-sse2.c
+++ b/transform-sse2.c
@@ -3,8 +3,8 @@
 #include "qcmsint.h"
 
 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
-#define FLOATSCALE  65536.0f
-#define CLAMPMAXVAL ( ((float) (65536 - 1)) / 65536.0f )
+#define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
+#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
 static const ALIGN float floatScaleX4[4] =
     { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
 static const ALIGN float clampMaxValueX4[4] =
diff --git a/transform.c b/transform.c
index 027139a..d218aa1 100644
--- a/transform.c
+++ b/transform.c
@@ -38,7 +38,7 @@
 float lut_interp_linear(double value, uint16_t *table, int length)
 {
 	int upper, lower;
-	value = value * (length - 1);
+	value = value * (length - 1); // scale to length of the array
 	upper = ceil(value);
 	lower = floor(value);
 	//XXX: can we be more performant here?
@@ -50,16 +50,61 @@
 /* same as above but takes and returns a uint16_t value representing a range from 0..1 */
 uint16_t lut_interp_linear16(uint16_t input_value, uint16_t *table, int length)
 {
+	/* Start scaling input_value to the length of the array: 65535*(length-1).
+	 * We'll divide out the 65535 next */
 	uint32_t value = (input_value * (length - 1));
 	uint32_t upper = (value + 65534) / 65535; /* equivalent to ceil(value/65535) */
 	uint32_t lower = value / 65535;           /* equivalent to floor(value/65535) */
+	/* interp is the distance from upper to value scaled to 0..65535 */
 	uint32_t interp = value % 65535;
 
-	value = (table[upper]*(interp) + table[lower]*(65535 - interp))/65535;
+	value = (table[upper]*(interp) + table[lower]*(65535 - interp))/65535; // 0..65535*65535
 
 	return value;
 }
 
+/* same as above but takes an input_value from 0..PRECACHE_OUTPUT_MAX
+ * and returns a uint8_t value representing a range from 0..1 */
+static
+uint8_t lut_interp_linear_precache_output(uint32_t input_value, uint16_t *table, int length)
+{
+	/* Start scaling input_value to the length of the array: PRECACHE_OUTPUT_MAX*(length-1).
+	 * We'll divide out the PRECACHE_OUTPUT_MAX next */
+	uint32_t value = (input_value * (length - 1));
+
+	/* equivalent to ceil(value/PRECACHE_OUTPUT_MAX) */
+	uint32_t upper = (value + PRECACHE_OUTPUT_MAX-1) / PRECACHE_OUTPUT_MAX;
+	/* equivalent to floor(value/PRECACHE_OUTPUT_MAX) */
+	uint32_t lower = value / PRECACHE_OUTPUT_MAX;
+	/* interp is the distance from upper to value scaled to 0..PRECACHE_OUTPUT_MAX */
+	uint32_t interp = value % PRECACHE_OUTPUT_MAX;
+
+	/* the table values range from 0..65535 */
+	value = (table[upper]*(interp) + table[lower]*(PRECACHE_OUTPUT_MAX - interp)); // 0..(65535*PRECACHE_OUTPUT_MAX)
+
+	/* round and scale */
+	value += (PRECACHE_OUTPUT_MAX*65535/255)/2;
+        value /= (PRECACHE_OUTPUT_MAX*65535/255); // scale to 0..255
+	return value;
+}
+
+#if 0
+/* if we use a different representation i.e. one that goes from 0 to 0x1000 we can be more efficient
+ * because we can avoid the divisions and use a shifting instead */
+/* same as above but takes and returns a uint16_t value representing a range from 0..1 */
+uint16_t lut_interp_linear16(uint16_t input_value, uint16_t *table, int length)
+{
+	uint32_t value = (input_value * (length - 1));
+	uint32_t upper = (value + 4095) / 4096; /* equivalent to ceil(value/4096) */
+	uint32_t lower = value / 4096;           /* equivalent to floor(value/4096) */
+	uint32_t interp = value % 4096;
+
+	value = (table[upper]*(interp) + table[lower]*(4096 - interp))/4096; // 0..4096*4096
+
+	return value;
+}
+#endif
+
 void compute_curve_gamma_table_type1(float gamma_table[256], double gamma)
 {
 	unsigned int i;
@@ -707,7 +752,7 @@
 		float linear = transform->input_gamma_table_gray[device];
 
 		/* we could round here... */
-		gray = linear * 65535.;
+		gray = linear * PRECACHE_OUTPUT_MAX;
 
 		*dest++ = transform->output_table_r->data[gray];
 		*dest++ = transform->output_table_g->data[gray];
@@ -726,7 +771,7 @@
 		float linear = transform->input_gamma_table_gray[device];
 
 		/* we could round here... */
-		gray = linear * 65535.;
+		gray = linear * PRECACHE_OUTPUT_MAX;
 
 		*dest++ = transform->output_table_r->data[gray];
 		*dest++ = transform->output_table_g->data[gray];
@@ -758,9 +803,9 @@
 		out_linear_b = clamp_float(out_linear_b);
 
 		/* we could round here... */
-		r = out_linear_r * 65535.;
-		g = out_linear_g * 65535.;
-		b = out_linear_b * 65535.;
+		r = out_linear_r * PRECACHE_OUTPUT_MAX;
+		g = out_linear_g * PRECACHE_OUTPUT_MAX;
+		b = out_linear_b * PRECACHE_OUTPUT_MAX;
 
 		*dest++ = transform->output_table_r->data[r];
 		*dest++ = transform->output_table_g->data[g];
@@ -792,9 +837,9 @@
 		out_linear_b = clamp_float(out_linear_b);
 
 		/* we could round here... */
-		r = out_linear_r * 65535.;
-		g = out_linear_g * 65535.;
-		b = out_linear_b * 65535.;
+		r = out_linear_r * PRECACHE_OUTPUT_MAX;
+		g = out_linear_g * PRECACHE_OUTPUT_MAX;
+		b = out_linear_b * PRECACHE_OUTPUT_MAX;
 
 		*dest++ = transform->output_table_r->data[r];
 		*dest++ = transform->output_table_g->data[g];
@@ -988,27 +1033,26 @@
 static void compute_precache_pow(uint8_t *output, float gamma)
 {
 	uint32_t v = 0;
-	for (v = 0; v <= 0xffff; v++) {
+	for (v = 0; v < PRECACHE_OUTPUT_SIZE; v++) {
 		//XXX: don't do integer/float conversion... and round?
-		output[v] = 255. * pow(v/65535., gamma);
+		output[v] = 255. * pow(v/(double)PRECACHE_OUTPUT_MAX, gamma);
 	}
 }
 
 void compute_precache_lut(uint8_t *output, uint16_t *table, int length)
 {
 	uint32_t v = 0;
-	for (v = 0; v <= 0xffff; v++) {
-		//XXX: don't do integer/float conversion... round?
-		output[v] = lut_interp_linear16(v, table, length) >> 8;
+	for (v = 0; v < PRECACHE_OUTPUT_SIZE; v++) {
+		output[v] = lut_interp_linear_precache_output(v, table, length);
 	}
 }
 
 void compute_precache_linear(uint8_t *output)
 {
 	uint32_t v = 0;
-	for (v = 0; v <= 0xffff; v++) {
+	for (v = 0; v < PRECACHE_OUTPUT_SIZE; v++) {
 		//XXX: round?
-		output[v] = v >> 8;
+		output[v] = v / (PRECACHE_OUTPUT_SIZE/256);
 	}
 }