Found a peculiarity with the add instruction

`add.u32` doesn't broadcast its output to all enabled components:
it only sets the first enabled one.
diff --git a/src/etnaviv_verifyops.c b/src/etnaviv_verifyops.c
index 01d3ed1..144b705 100644
--- a/src/etnaviv_verifyops.c
+++ b/src/etnaviv_verifyops.c
@@ -140,7 +140,10 @@
 {
     uint32_t base = seed * width;
     for (size_t x=0; x<width; ++x) {
-        ((uint32_t*)a)[x*4] = base + x;
+        ((uint32_t*)a)[x*4+0] = base + x;
+        ((uint32_t*)a)[x*4+1] = 0x51515151; /* fill other vector elements with recognizable random pattern */
+        ((uint32_t*)a)[x*4+2] = 0x15151515;
+        ((uint32_t*)a)[x*4+3] = 0x36363636;
     }
 }
 
@@ -148,22 +151,26 @@
 {
     uint32_t base = seed * height;
     for (size_t y=0; y<height; ++y) {
-        ((uint32_t*)b)[y*4] = base + y;
+        ((uint32_t*)b)[y*4+0] = base + y;
+        ((uint32_t*)b)[y*4+1] = 0x82828282; /* fill other vector elements with recognizable random pattern */
+        ((uint32_t*)b)[y*4+2] = 0x48484848;
+        ((uint32_t*)b)[y*4+3] = 0x27272727;
     }
 }
 
-/** Scalar computations */
 #define A (a[x*4])
 #define B (b[y*4])
+/* Scalar computations broadcasted to all channels */
 #define CPU_COMPUTE_FUNC1(_name, _type, _expr) \
     static void _name(_type *out, const _type *a, const _type *b, size_t width, size_t height) \
     { \
         for(size_t y=0; y<height; ++y) { \
             for(size_t x=0; x<width; ++x) { \
-                out[(y*width+x)*4] = (_expr); \
+                out[(y*width+x)*4+0] = out[(y*width+x)*4+1] = out[(y*width+x)*4+2] = out[(y*width+x)*4+3] = (_expr); \
             } \
         } \
     }
+/* Independent expressions for channels */
 #define CPU_COMPUTE_FUNC4(_name, _type, _expr0, _expr1, _expr2, _expr3) \
     static void _name(_type *out, const _type *a, const _type *b, size_t width, size_t height) \
     { \
@@ -202,47 +209,56 @@
             0x00000000, 0x00000000, 0x00000000, 0x00000000, /* nop */
         }))
     },
+    /* Add will only output one element at a time */
     {"add.u32", 1, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)addu32_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x00841001, 0x00202800, 0x80000000, 0x00000038, /* add.u32       t4, t2, void, t3 */
         }))
     },
-    {"imullo0.u32", 1, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)mulu32_compute_cpu,
+    {"add4.u32", 4, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)addu32_compute_cpu,
+        GPU_CODE(((uint32_t[]){
+            0x00841001, 0x00202800, 0x80000000, 0x00000038, /* add.u32	t4.x___, t2.xxxx, void, t3.xxxx */
+            0x01041001, 0x00202800, 0x80000000, 0x00000038, /* add.u32	t4._y__, t2.xxxx, void, t3.xxxx */
+            0x02041001, 0x00202800, 0x80000000, 0x00000038, /* add.u32	t4.__z_, t2.xxxx, void, t3.xxxx */
+            0x04041001, 0x00202800, 0x80000000, 0x00000038, /* add.u32	t4.___w, t2.xxxx, void, t3.xxxx */
+        }))
+    },
+    {"imullo0.u32", 4, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)mulu32_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x0784103c, 0x39202800, 0x81c801c0, 0x00000000, /* imullo0.u32   t4, t2, t3, void */
         }))
     },
-    {"lshift.u32", 1, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)lshiftu32_compute_cpu,
+    {"lshift.u32", 4, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)lshiftu32_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x07841019, 0x39202800, 0x80010000, 0x00390038, /* lshift.u32    t4, t2, void, t3 */
         }))
     },
-    {"rshift.u32", 1, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)rshiftu32_compute_cpu,
+    {"rshift.u32", 4, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)rshiftu32_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x0784101a, 0x39202800, 0x80010000, 0x00390038, /* rshift.u32    t4, t2, void, t3 */
         }))
     },
-    {"rotate.u32", 1, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)rotateu32_compute_cpu,
+    {"rotate.u32", 4, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)rotateu32_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x0784101b, 0x39202800, 0x80010000, 0x00390038, /* rotate.u32    t4, t2, void, t3 */
         }))
     },
-    {"or.u32", 1, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)oru32_compute_cpu,
+    {"or.u32", 4, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)oru32_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x0784101c, 0x39202800, 0x80010000, 0x00390038, /* or.u32        t4, t2, void, t3 */
         }))
     },
-    {"and.u32", 1, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)andu32_compute_cpu,
+    {"and.u32", 4, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)andu32_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x0784101d, 0x39202800, 0x80010000, 0x00390038, /* and.u32       t4, t2, void, t3 */
         }))
     },
-    {"xor.u32", 1, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)xoru32_compute_cpu,
+    {"xor.u32", 4, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)xoru32_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x0784101e, 0x39202800, 0x80010000, 0x00390038, /* xor.u32       t4, t2, void, t3 */
         }))
     },
-    {"not.u32", 1, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)notu32_compute_cpu,
+    {"not.u32", 4, CT_INT32, i32_generate_values_h, i32_generate_values_v, (void*)notu32_compute_cpu,
         GPU_CODE(((uint32_t[]){
             0x0784101f, 0x00200000, 0x80010000, 0x00390028, /* not.u32       t4, void, void, t2 */
         }))