Always test all four components of output

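More thorough testing: the add.u32 test now compares all four output
components, expecting the three unwritten ones to keep the padding
pattern. The add.f32 test gets four distinct components per source
element from the new i32_generate_values_h4/_v4 generators, and the
float add/mul CPU references are computed per component.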

Signed-off-by: Wladimir J. van der Laan <laanwj@gmail.com>
diff --git a/src/etnaviv_verifyops.c b/src/etnaviv_verifyops.c
index 77fd470..f00ef94 100644
--- a/src/etnaviv_verifyops.c
+++ b/src/etnaviv_verifyops.c
@@ -211,6 +211,17 @@
     }
 }
 
+void i32_generate_values_h4(size_t seed, void *a, size_t width)
+{
+    uint32_t *out = a;
+    const uint32_t base = (uint32_t)(seed * width);
+    for (size_t x = 0; x < width; ++x) {
+        for (uint32_t c = 0; c < 4; ++c) { /* component c gets offset c << 16 */
+            out[x * 4 + c] = base + (uint32_t)x + (c << 16);
+        }
+    }
+}
+
 void i32_generate_values_v(size_t seed, void *b, size_t height)
 {
     uint32_t base = seed * height;
@@ -222,10 +233,25 @@
     }
 }
 
+void i32_generate_values_v4(size_t seed, void *b, size_t height)
+{
+    uint32_t *out = b;
+    const uint32_t base = (uint32_t)(seed * height);
+    for (size_t y = 0; y < height; ++y) {
+        for (uint32_t c = 0; c < 4; ++c) { /* component c gets offset c << 16 */
+            out[y * 4 + c] = base + (uint32_t)y + (c << 16);
+        }
+    }
+}
+
+/* Shortcuts for component 0 of source elements a[x] and b[y] (4 values each) */
 #define A (a[x*4])
 #define B (b[y*4])
+/* Component i of source elements a[x] and b[y] */
+#define AI(i) (a[x*4+(i)])
+#define BI(i) (b[y*4+(i)])
 /* Scalar computations broadcasted to all channels */
-#define CPU_COMPUTE_FUNC1(_name, _type, _expr) \
+#define CPU_COMPUTE_FUNC1_BCAST(_name, _type, _expr) \
     static void _name(_type *out, const _type *a, const _type *b, size_t width, size_t height) \
     { \
         for(size_t y=0; y<height; ++y) { \
@@ -234,6 +260,19 @@
             } \
         } \
     }
+/* Scalar computation written to channel 0 only; channels 1-3 get the fixed pattern 0x55555555, 0xaaaaaaaa, 0x55555555 */
+#define CPU_COMPUTE_FUNC1_PAD(_name, _type, _expr) \
+    static void _name(_type *out, const _type *a, const _type *b, size_t width, size_t height) \
+    { \
+        for(size_t y=0; y<height; ++y) { \
+            for(size_t x=0; x<width; ++x) { \
+                out[(y*width+x)*4+0] = (_expr); \
+                out[(y*width+x)*4+1] = 0x55555555; \
+                out[(y*width+x)*4+2] = 0xaaaaaaaa; \
+                out[(y*width+x)*4+3] = 0x55555555; \
+            } \
+        } \
+    }
 /* Independent expressions for channels */
 #define CPU_COMPUTE_FUNC4(_name, _type, _expr0, _expr1, _expr2, _expr3) \
     static void _name(_type *out, const _type *a, const _type *b, size_t width, size_t height) \
@@ -249,24 +288,27 @@
     }
 CPU_COMPUTE_FUNC4(nop_compute_cpu, uint32_t, 0xaaaaaaaa, 0x55555555, 0xaaaaaaaa, 0x55555555);
 /* u32 */
-CPU_COMPUTE_FUNC1(addu32_compute_cpu, uint32_t, A + B);
-CPU_COMPUTE_FUNC1(mulu32_compute_cpu, uint32_t, A * B);
-CPU_COMPUTE_FUNC1(mulhu32_compute_cpu, uint32_t, ((uint64_t)A * (uint64_t)B)>>32);
-CPU_COMPUTE_FUNC1(madu32_compute_cpu, uint32_t, A * B + 0x12345678);
-CPU_COMPUTE_FUNC1(lshiftu32_compute_cpu, uint32_t, A << (B&31));
-CPU_COMPUTE_FUNC1(rshiftu32_compute_cpu, uint32_t, A >> (B&31));
-CPU_COMPUTE_FUNC1(rotateu32_compute_cpu, uint32_t, (A << (B&31)) | (A >> ((32-B)&31)));
-CPU_COMPUTE_FUNC1(oru32_compute_cpu, uint32_t, A | B);
-CPU_COMPUTE_FUNC1(andu32_compute_cpu, uint32_t, A & B);
-CPU_COMPUTE_FUNC1(xoru32_compute_cpu, uint32_t, A ^ B);
-CPU_COMPUTE_FUNC1(notu32_compute_cpu, uint32_t, ~A);
-CPU_COMPUTE_FUNC1(leadzerou32_compute_cpu, uint32_t, (A != 0) ? __builtin_clz(A) : 0x20);
+CPU_COMPUTE_FUNC1_PAD(addu32_single_compute_cpu, uint32_t, A + B);
+CPU_COMPUTE_FUNC1_BCAST(addu32_compute_cpu, uint32_t, A + B);
+CPU_COMPUTE_FUNC1_BCAST(mulu32_compute_cpu, uint32_t, A * B);
+CPU_COMPUTE_FUNC1_BCAST(mulhu32_compute_cpu, uint32_t, ((uint64_t)A * (uint64_t)B)>>32);
+CPU_COMPUTE_FUNC1_BCAST(madu32_compute_cpu, uint32_t, A * B + 0x12345678);
+CPU_COMPUTE_FUNC1_BCAST(lshiftu32_compute_cpu, uint32_t, A << (B&31));
+CPU_COMPUTE_FUNC1_BCAST(rshiftu32_compute_cpu, uint32_t, A >> (B&31));
+CPU_COMPUTE_FUNC1_BCAST(rotateu32_compute_cpu, uint32_t, (A << (B&31)) | (A >> ((32-B)&31)));
+CPU_COMPUTE_FUNC1_BCAST(oru32_compute_cpu, uint32_t, A | B);
+CPU_COMPUTE_FUNC1_BCAST(andu32_compute_cpu, uint32_t, A & B);
+CPU_COMPUTE_FUNC1_BCAST(xoru32_compute_cpu, uint32_t, A ^ B);
+CPU_COMPUTE_FUNC1_BCAST(notu32_compute_cpu, uint32_t, ~A);
+CPU_COMPUTE_FUNC1_BCAST(leadzerou32_compute_cpu, uint32_t, (A != 0) ? __builtin_clz(A) : 0x20);
 /* float */
-CPU_COMPUTE_FUNC1(addf32_compute_cpu, float, A + B);
-CPU_COMPUTE_FUNC1(mulf32_compute_cpu, float, A * B);
+CPU_COMPUTE_FUNC4(addf32_compute_cpu, float, AI(0) + BI(0), AI(1) + BI(1), AI(2) + BI(2), AI(3) + BI(3));
+CPU_COMPUTE_FUNC4(mulf32_compute_cpu, float, AI(0) * BI(0), AI(1) * BI(1), AI(2) * BI(2), AI(3) * BI(3));
 
 #undef A
 #undef B
+#undef AI
+#undef BI
 #undef CPU_COMPUTE
 
 /* Tests GPU code must take from a[x] t2 and b[y] t3, and output to t4.
@@ -279,7 +321,7 @@
         }))
     },
     /* Add will only output one element at a time */
-    {"add.u32", 1, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)addu32_compute_cpu,
+    {"add.u32", 4, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)addu32_single_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x00841001, 0x00202800, 0x80000000, 0x00000038, /* add.u32       t4, t2, void, t3 */
         }))
@@ -351,7 +393,7 @@
     // add.u16 does nothing
     // 0x00801001, 0x15402800, 0xc0000000, 0x00000018, /* add.u16       t0.x___, t2.yyyy, void, t1.xxxx */
     // Need an effective way of comparing these
-    {"add.f32", 1, CT_FLOAT32, i32_generate_values_h, i32_generate_values_v, (void*)addf32_compute_cpu,
+    {"add.f32", 4, CT_FLOAT32, i32_generate_values_h4, i32_generate_values_v4, (void*)addf32_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x07841001, 0x39002800, 0x00000000, 0x00390038, /* add           t4, t2, void, t3 */
         }))