src/broadcom/compiler/qpu_validate.c - third_party/mesa - Git at Google

 /*
  * Copyright © 2014 Broadcom
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */

 /**
  * @file
  *
  * Validates the QPU instruction sequence after register allocation and
  * scheduling.
  */

 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "v3d_compiler.h"
 #include "qpu/qpu_disasm.h"

 /* Tracking data carried from one instruction to the next during validation. */
 struct v3d_qpu_validate_state {
         /* Shader being validated. */
         struct v3d_compile *c;

         /* Instruction validated immediately before the current one. */
         const struct v3d_qpu_instr *last;

         /* Index of the instruction currently being validated. */
         int ip;

         /* ip recorded at the most recent branch, THRSW and SFU write. */
         int last_branch_ip;
         int last_thrsw_ip;
         int last_sfu_write;

         /* Lowest ip seen so far on an instruction flagged is_tlb_z_write. */
         int first_tlb_z_write;

         /* Number of THRSW signals seen, not counting a THRSW that directly
          * follows another one.
          */
         int thrsw_count;

         /* Set when we've found the last-THRSW signal, or if we were started
          * in single-segment mode.
          */
         bool last_thrsw_found;

         /* Set when we've found the THRSW after the last THRSW */
         bool thrend_found;

         /* rtop_valid is set by a MULTOP and cleared by a UMUL24.  If it is
          * still set two instructions after a THRSW it is turned into
          * rtop_hazard, which makes the next UMUL24 a validation failure.
          */
         bool rtop_valid;
         bool rtop_hazard;
 };

 /* Prints msg together with state->ip on stderr, dumps every instruction of
  * the program with a marker on the one whose index equals state->ip, and
  * then aborts the process.  Never returns.
  */
 static void
 fail_instr(struct v3d_qpu_validate_state *state, const char *msg)
 {
         struct v3d_compile *c = state->c;
         int cur_ip = 0;

         fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg);

         vir_for_each_inst_inorder(inst, c) {
                 const bool is_failing_inst = (cur_ip == state->ip);

                 v3d_qpu_dump(c->devinfo, &inst->qpu);

                 if (is_failing_inst)
                         fprintf(stderr, " *** ERROR ***");

                 fprintf(stderr, "\n");
                 cur_ip++;
         }

         fprintf(stderr, "\n");
         abort();
 }

 /* True while state->ip is less than 3 past state->last_branch_ip. */
 static bool
 in_branch_delay_slots(struct v3d_qpu_validate_state *state)
 {
         const int since_branch = state->ip - state->last_branch_ip;

         return since_branch < 3;
 }

 /* True while state->ip is less than 3 past state->last_thrsw_ip. */
 static bool
 in_thrsw_delay_slots(struct v3d_qpu_validate_state *state)
 {
         const int since_thrsw = state->ip - state->last_thrsw_ip;

         return since_thrsw < 3;
 }

 /* Returns true if the add or the mul half of inst is a non-NOP operation
  * doing a magic write to an address for which predicate() returns true.
  *
  * Only V3D_QPU_INSTR_TYPE_ALU instructions are inspected, since the alu.add
  * and alu.mul fields read below describe ALU instructions; every other
  * instruction type yields false.  (The guard used to be inverted, which
  * rejected all ALU instructions and read the ALU fields of everything
  * else.)
  */
 static bool
 qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
                         bool (*predicate)(enum v3d_qpu_waddr waddr))
 {
         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                 return false;

         if (inst->alu.add.op != V3D_QPU_A_NOP &&
             inst->alu.add.magic_write &&
             predicate(inst->alu.add.waddr))
                 return true;

         if (inst->alu.mul.op != V3D_QPU_M_NOP &&
             inst->alu.mul.magic_write &&
             predicate(inst->alu.mul.waddr))
                 return true;

         return false;
 }

 /* Checks a single instruction against the restrictions validated by this
  * file and updates the cross-instruction tracking kept in state (first TLB Z
  * write, last branch / SFU write / THRSW, rtop tracking).  Any violation is
  * reported through fail_instr(), which aborts.
  *
  * state->ip is read as the index of qinst and state->last as the previously
  * validated instruction (checked for NULL before use); neither is modified
  * here.
  */
 static void
 qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
 {
         const struct v3d_device_info *devinfo = state->c->devinfo;

         if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
                 state->first_tlb_z_write = state->ip;

         const struct v3d_qpu_instr *inst = &qinst->qpu;

         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                 if (state->first_tlb_z_write >= 0 &&
                     state->ip > state->first_tlb_z_write &&
                     inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
                     inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
                     inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
                     inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
                         fail_instr(state, "Implicit branch MSF read after TLB Z write");
                 }

                 /* This has to run before the non-ALU early return below:
                  * when it lived at the end of the function it was
                  * unreachable, so last_branch_ip was never recorded and no
                  * branch delay slot check could ever trigger.
                  */
                 if (in_branch_delay_slots(state))
                         fail_instr(state, "branch in a branch delay slot.");
                 if (in_thrsw_delay_slots(state))
                         fail_instr(state, "branch in a THRSW delay slot.");
                 state->last_branch_ip = state->ip;
         }

         /* Everything below looks at ALU operations and signals. */
         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                 return;

         if (inst->alu.mul.op == V3D_QPU_M_MULTOP)
                 state->rtop_valid = true;

         if (inst->alu.mul.op == V3D_QPU_M_UMUL24) {
                 if (state->rtop_hazard)
                         fail_instr(state, "UMUL24 reads rtop from MULTOP but it got cleared by a previous THRSW");
                 state->rtop_valid = false;
                 state->rtop_hazard = false;
         }

         if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
             state->first_tlb_z_write >= 0 &&
             state->ip > state->first_tlb_z_write) {
                 fail_instr(state, "SETMSF after TLB Z write");
         }

         if (state->first_tlb_z_write >= 0 &&
             state->ip > state->first_tlb_z_write &&
             inst->alu.add.op == V3D_QPU_A_MSF) {
                 fail_instr(state, "MSF read after TLB Z write");
         }

         if (devinfo->ver < 71) {
                 if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
                     inst->sig.small_imm_d) {
                         fail_instr(state, "small imm a/c/d added after V3D 7.1");
                 }
         } else {
                 if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
                     !vir_is_add(qinst)) {
                         fail_instr(state, "small imm a/b used but no ADD inst");
                 }
                 if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
                     !vir_is_mul(qinst)) {
                         fail_instr(state, "small imm c/d used but no MUL inst");
                 }
                 if (inst->sig.small_imm_a + inst->sig.small_imm_b +
                     inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
                         fail_instr(state, "only one small immediate can be "
                                    "enabled per instruction");
                 }
         }

         /* LDVARY writes r5 two instructions later and LDUNIF writes
          * r5 one instruction later, which is illegal to have
          * together.
          */
         if (state->last && state->last->sig.ldvary &&
             (inst->sig.ldunif || inst->sig.ldunifa)) {
                 fail_instr(state, "LDUNIF after a LDVARY");
         }

         /* GFXH-1633 (fixed since V3D 4.2.14, which is Rpi4)
          *
          * FIXME: This would not check correctly for V3D 4.2 versions lower
          * than V3D 4.2.14, but that is not a real issue because the simulator
          * will still catch this, and we are not really targeting any such
          * versions anyway.
          */
         if (devinfo->ver < 42) {
                 bool last_reads_ldunif = (state->last && (state->last->sig.ldunif ||
                                                           state->last->sig.ldunifrf));
                 bool last_reads_ldunifa = (state->last && (state->last->sig.ldunifa ||
                                                            state->last->sig.ldunifarf));
                 bool reads_ldunif = inst->sig.ldunif || inst->sig.ldunifrf;
                 bool reads_ldunifa = inst->sig.ldunifa || inst->sig.ldunifarf;
                 if ((last_reads_ldunif && reads_ldunifa) ||
                     (last_reads_ldunifa && reads_ldunif)) {
                         fail_instr(state,
                                    "LDUNIF and LDUNIFA can't be next to each other");
                 }
         }

         /* Count the magic writes of each kind done by the add and mul halves
          * of this instruction.
          */
         int tmu_writes = 0;
         int sfu_writes = 0;
         int vpm_writes = 0;
         int tlb_writes = 0;
         int tsy_writes = 0;

         if (inst->alu.add.op != V3D_QPU_A_NOP) {
                 if (inst->alu.add.magic_write) {
                         if (v3d_qpu_magic_waddr_is_tmu(devinfo,
                                                        inst->alu.add.waddr)) {
                                 tmu_writes++;
                         }
                         if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))
                                 sfu_writes++;
                         if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr))
                                 vpm_writes++;
                         if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr))
                                 tlb_writes++;
                         if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr))
                                 tsy_writes++;
                 }
         }

         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                 if (inst->alu.mul.magic_write) {
                         if (v3d_qpu_magic_waddr_is_tmu(devinfo,
                                                        inst->alu.mul.waddr)) {
                                 tmu_writes++;
                         }
                         if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))
                                 sfu_writes++;
                         if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr))
                                 vpm_writes++;
                         if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr))
                                 tlb_writes++;
                         if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr))
                                 tsy_writes++;
                 }
         }

         if (in_thrsw_delay_slots(state)) {
                 /* There's no way you want to start SFU during the THRSW delay
                  * slots, since the result would land in the other thread.
                  */
                 if (sfu_writes) {
                         fail_instr(state,
                                    "SFU write started during THRSW delay slots ");
                 }

                 if (inst->sig.ldvary) {
                         if (devinfo->ver == 42)
                                 fail_instr(state, "LDVARY during THRSW delay slots");
                         if (devinfo->ver >= 71 &&
                             state->ip - state->last_thrsw_ip == 2) {
                                 fail_instr(state, "LDVARY in 2nd THRSW delay slot");
                         }
                 }
         }

         (void)qpu_magic_waddr_matches; /* XXX */

         /* SFU r4 results come back two instructions later.  No doing
          * r4 read/writes or other SFU lookups until it's done.
          */
         if (state->ip - state->last_sfu_write < 2) {
                 if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4))
                         fail_instr(state, "R4 read too soon after SFU");

                 if (v3d_qpu_writes_r4(devinfo, inst))
                         fail_instr(state, "R4 write too soon after SFU");

                 if (sfu_writes)
                         fail_instr(state, "SFU write too soon after SFU");
         }

         /* XXX: The docs say VPM can happen with the others, but the simulator
          * disagrees.
          */
         if (tmu_writes +
             sfu_writes +
             vpm_writes +
             tlb_writes +
             tsy_writes +
             (devinfo->ver == 42 ? inst->sig.ldtmu : 0) +
             inst->sig.ldtlb +
             inst->sig.ldvpm +
             inst->sig.ldtlbu > 1) {
                 fail_instr(state,
                            "Only one of [TMU, SFU, TSY, TLB read, VPM] allowed");
         }

         if (sfu_writes)
                 state->last_sfu_write = state->ip;

         if (inst->sig.thrsw) {
                 if (in_branch_delay_slots(state))
                         fail_instr(state, "THRSW in a branch delay slot.");

                 if (state->last_thrsw_found)
                         state->thrend_found = true;

                 if (state->last_thrsw_ip == state->ip - 1) {
                         /* If it's the second THRSW in a row, then it's just a
                          * last-thrsw signal.
                          */
                         if (state->last_thrsw_found)
                                 fail_instr(state, "Two last-THRSW signals");
                         state->last_thrsw_found = true;
                 } else {
                         if (in_thrsw_delay_slots(state)) {
                                 fail_instr(state,
                                            "THRSW too close to another THRSW.");
                         }
                         state->thrsw_count++;
                         state->last_thrsw_ip = state->ip;
                 }
         }

         /* Distance from the most recently recorded THRSW: 0 on the THRSW
          * instruction itself, 1 and 2 on the two instructions after it.
          * This used to be computed as last_thrsw_ip - ip, which is never
          * positive, so the "== 2" check below could not trigger.
          */
         const int since_thrsw = state->ip - state->last_thrsw_ip;

         if (state->thrend_found && since_thrsw <= 2) {
                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
                     !inst->alu.add.magic_write) {
                         if (devinfo->ver == 42) {
                                 fail_instr(state, "RF write after THREND");
                         } else if (devinfo->ver >= 71) {
                                 if (since_thrsw == 0) {
                                         fail_instr(state,
                                                    "ADD RF write at THREND");
                                 }
                                 if (inst->alu.add.waddr == 2 ||
                                     inst->alu.add.waddr == 3) {
                                         fail_instr(state,
                                                    "RF2-3 write after THREND");
                                 }
                         }
                 }

                 if (inst->alu.mul.op != V3D_QPU_M_NOP &&
                     !inst->alu.mul.magic_write) {
                         if (devinfo->ver == 42) {
                                 fail_instr(state, "RF write after THREND");
                         } else if (devinfo->ver >= 71) {
                                 if (since_thrsw == 0) {
                                         fail_instr(state,
                                                    "MUL RF write at THREND");
                                 }

                                 if (inst->alu.mul.waddr == 2 ||
                                     inst->alu.mul.waddr == 3) {
                                         fail_instr(state,
                                                    "RF2-3 write after THREND");
                                 }
                         }
                 }

                 if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
                     !inst->sig_magic) {
                         if (devinfo->ver == 42) {
                                 fail_instr(state, "RF write after THREND");
                         } else if (devinfo->ver >= 71 &&
                                    (inst->sig_addr == 2 ||
                                     inst->sig_addr == 3)) {
                                 fail_instr(state, "RF2-3 write after THREND");
                         }
                 }

                 /* GFXH-1625: No TMUWT in the last instruction */
                 if (since_thrsw == 2 &&
                     inst->alu.add.op == V3D_QPU_A_TMUWT)
                         fail_instr(state, "TMUWT in last instruction");
         }

         /* A MULTOP result still pending two instructions after a THRSW is
          * remembered as a hazard for the next UMUL24.
          */
         if (state->rtop_valid && state->ip == state->last_thrsw_ip + 2) {
                 state->rtop_hazard = true;
                 state->rtop_valid = false;
         }
 }

 /* Validates each instruction that vir_for_each_inst() yields for block.
  * After every instruction, state->last is pointed at it and state->ip is
  * advanced by one.
  */
 static void
 qpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block)
 {
         vir_for_each_inst(qinst, block) {
                 const struct v3d_qpu_instr *validated = &qinst->qpu;

                 qpu_validate_inst(state, qinst);

                 /* The instruction just checked becomes the previous one. */
                 state->last = validated;
                 state->ip += 1;
         }
 }

 /**
  * Checks for the instruction restrictions from page 37 ("Summary of
  * Instruction Restrictions").
  */
 void
 qpu_validate(struct v3d_compile *c)
 {
         /* We don't want to do validation in release builds, but we want to
          * keep compiling the validation code to make sure it doesn't get
          * broken.
          */
 #if !MESA_DEBUG
         return;
 #endif

         struct v3d_qpu_validate_state state = {
                 .c = c,
                 .last_sfu_write = -10,
                 .last_thrsw_ip = -10,
                 .last_branch_ip = -10,
                 .first_tlb_z_write = INT_MAX,
                 .ip = 0,

                 .last_thrsw_found = !c->last_thrsw,
                 .rtop_hazard = false,
                 .rtop_valid = false,
         };

         vir_for_each_block(block, c) {
                 qpu_validate_block(&state, block);
         }

         if (state.thrsw_count > 1 && !state.last_thrsw_found) {
                 fail_instr(&state,
                            "thread switch found without last-THRSW in program");
         }

         if (!state.thrend_found)
                 fail_instr(&state, "No program-end THRSW found");
 }
	/*
	* Copyright © 2014 Broadcom
	*
	* Permission is hereby granted, free of charge, to any person obtaining a
	* copy of this software and associated documentation files (the "Software"),
	* to deal in the Software without restriction, including without limitation
	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
	* and/or sell copies of the Software, and to permit persons to whom the
	* Software is furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice (including the next
	* paragraph) shall be included in all copies or substantial portions of the
	* Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	* IN THE SOFTWARE.
	*/

	/**
	* @file
	*
	* Validates the QPU instruction sequence after register allocation and
	* scheduling.
	*/

	#include <assert.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include "v3d_compiler.h"
	#include "qpu/qpu_disasm.h"

	/* Tracking data carried from one instruction to the next during validation. */
	struct v3d_qpu_validate_state {
		/* Shader being validated. */
		struct v3d_compile *c;

		/* Instruction validated immediately before the current one. */
		const struct v3d_qpu_instr *last;

		/* Index of the instruction currently being validated. */
		int ip;

		/* ip recorded at the most recent branch, THRSW and SFU write. */
		int last_branch_ip;
		int last_thrsw_ip;
		int last_sfu_write;

		/* Lowest ip seen so far on an instruction flagged is_tlb_z_write. */
		int first_tlb_z_write;

		/* Number of THRSW signals seen, not counting a THRSW that directly
		 * follows another one.
		 */
		int thrsw_count;

		/* Set when we've found the last-THRSW signal, or if we were started
		 * in single-segment mode.
		 */
		bool last_thrsw_found;

		/* Set when we've found the THRSW after the last THRSW */
		bool thrend_found;

		/* rtop_valid is set by a MULTOP and cleared by a UMUL24.  If it is
		 * still set two instructions after a THRSW it is turned into
		 * rtop_hazard, which makes the next UMUL24 a validation failure.
		 */
		bool rtop_valid;
		bool rtop_hazard;
	};

	static void
	fail_instr(struct v3d_qpu_validate_state state, const char msg)
	{
	struct v3d_compile *c = state->c;

	fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg);

	int dump_ip = 0;
	vir_for_each_inst_inorder(inst, c) {
	v3d_qpu_dump(c->devinfo, &inst->qpu);

	if (dump_ip++ == state->ip)
	fprintf(stderr, " * ERROR *");

	fprintf(stderr, "\n");
	}

	fprintf(stderr, "\n");
	abort();
	}

	/* True while state->ip is less than 3 past state->last_branch_ip. */
	static bool
	in_branch_delay_slots(struct v3d_qpu_validate_state *state)
	{
		const int since_branch = state->ip - state->last_branch_ip;

		return since_branch < 3;
	}

	/* True while state->ip is less than 3 past state->last_thrsw_ip. */
	static bool
	in_thrsw_delay_slots(struct v3d_qpu_validate_state *state)
	{
		const int since_thrsw = state->ip - state->last_thrsw_ip;

		return since_thrsw < 3;
	}

	/* Returns true if the add or the mul half of inst is a non-NOP operation
	 * doing a magic write to an address for which predicate() returns true.
	 *
	 * Only V3D_QPU_INSTR_TYPE_ALU instructions are inspected, since the alu.add
	 * and alu.mul fields read below describe ALU instructions; every other
	 * instruction type yields false.  (The guard used to be inverted, which
	 * rejected all ALU instructions and read the ALU fields of everything
	 * else.)
	 */
	static bool
	qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
				bool (*predicate)(enum v3d_qpu_waddr waddr))
	{
		if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
			return false;

		if (inst->alu.add.op != V3D_QPU_A_NOP &&
		    inst->alu.add.magic_write &&
		    predicate(inst->alu.add.waddr))
			return true;

		if (inst->alu.mul.op != V3D_QPU_M_NOP &&
		    inst->alu.mul.magic_write &&
		    predicate(inst->alu.mul.waddr))
			return true;

		return false;
	}

	static void
	qpu_validate_inst(struct v3d_qpu_validate_state state, struct qinst qinst)
	{
	const struct v3d_device_info *devinfo = state->c->devinfo;

	if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
	state->first_tlb_z_write = state->ip;

	const struct v3d_qpu_instr *inst = &qinst->qpu;

	if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
	state->first_tlb_z_write >= 0 &&
	state->ip > state->first_tlb_z_write &&
	inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
	inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
	inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
	inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
	fail_instr(state, "Implicit branch MSF read after TLB Z write");
	}

	if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
	return;

	if (inst->alu.mul.op == V3D_QPU_M_MULTOP)
	state->rtop_valid = true;

	if (inst->alu.mul.op == V3D_QPU_M_UMUL24) {
	if (state->rtop_hazard)
	fail_instr(state, "UMUL24 reads rtop from MULTOP but it got cleared by a previous THRSW");
	state->rtop_valid = false;
	state->rtop_hazard = false;
	}

	if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
	state->first_tlb_z_write >= 0 &&
	state->ip > state->first_tlb_z_write) {
	fail_instr(state, "SETMSF after TLB Z write");
	}

	if (state->first_tlb_z_write >= 0 &&
	state->ip > state->first_tlb_z_write &&
	inst->alu.add.op == V3D_QPU_A_MSF) {
	fail_instr(state, "MSF read after TLB Z write");
	}

	if (devinfo->ver < 71) {
	if (inst->sig.small_imm_a \|\| inst->sig.small_imm_c \|\|
	inst->sig.small_imm_d) {
	fail_instr(state, "small imm a/c/d added after V3D 7.1");
	}
	} else {
	if ((inst->sig.small_imm_a \|\| inst->sig.small_imm_b) &&
	!vir_is_add(qinst)) {
	fail_instr(state, "small imm a/b used but no ADD inst");
	}
	if ((inst->sig.small_imm_c \|\| inst->sig.small_imm_d) &&
	!vir_is_mul(qinst)) {
	fail_instr(state, "small imm c/d used but no MUL inst");
	}
	if (inst->sig.small_imm_a + inst->sig.small_imm_b +
	inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
	fail_instr(state, "only one small immediate can be "
	"enabled per instruction");
	}
	}

	/* LDVARY writes r5 two instructions later and LDUNIF writes
	* r5 one instruction later, which is illegal to have
	* together.
	*/
	if (state->last && state->last->sig.ldvary &&
	(inst->sig.ldunif \|\| inst->sig.ldunifa)) {
	fail_instr(state, "LDUNIF after a LDVARY");
	}

	/* GFXH-1633 (fixed since V3D 4.2.14, which is Rpi4)
	*
	* FIXME: This would not check correctly for V3D 4.2 versions lower
	* than V3D 4.2.14, but that is not a real issue because the simulator
	* will still catch this, and we are not really targeting any such
	* versions anyway.
	*/
	if (state->c->devinfo->ver < 42) {
	bool last_reads_ldunif = (state->last && (state->last->sig.ldunif \|\|
	state->last->sig.ldunifrf));
	bool last_reads_ldunifa = (state->last && (state->last->sig.ldunifa \|\|
	state->last->sig.ldunifarf));
	bool reads_ldunif = inst->sig.ldunif \|\| inst->sig.ldunifrf;
	bool reads_ldunifa = inst->sig.ldunifa \|\| inst->sig.ldunifarf;
	if ((last_reads_ldunif && reads_ldunifa) \|\|
	(last_reads_ldunifa && reads_ldunif)) {
	fail_instr(state,
	"LDUNIF and LDUNIFA can't be next to each other");
	}
	}

	int tmu_writes = 0;
	int sfu_writes = 0;
	int vpm_writes = 0;
	int tlb_writes = 0;
	int tsy_writes = 0;

	if (inst->alu.add.op != V3D_QPU_A_NOP) {
	if (inst->alu.add.magic_write) {
	if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo,
	inst->alu.add.waddr)) {
	tmu_writes++;
	}
	if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))
	sfu_writes++;
	if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr))
	vpm_writes++;
	if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr))
	tlb_writes++;
	if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr))
	tsy_writes++;
	}
	}

	if (inst->alu.mul.op != V3D_QPU_M_NOP) {
	if (inst->alu.mul.magic_write) {
	if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo,
	inst->alu.mul.waddr)) {
	tmu_writes++;
	}
	if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))
	sfu_writes++;
	if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr))
	vpm_writes++;
	if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr))
	tlb_writes++;
	if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr))
	tsy_writes++;
	}
	}

	if (in_thrsw_delay_slots(state)) {
	/* There's no way you want to start SFU during the THRSW delay
	* slots, since the result would land in the other thread.
	*/
	if (sfu_writes) {
	fail_instr(state,
	"SFU write started during THRSW delay slots ");
	}

	if (inst->sig.ldvary) {
	if (devinfo->ver == 42)
	fail_instr(state, "LDVARY during THRSW delay slots");
	if (devinfo->ver >= 71 &&
	state->ip - state->last_thrsw_ip == 2) {
	fail_instr(state, "LDVARY in 2nd THRSW delay slot");
	}
	}
	}

	(void)qpu_magic_waddr_matches; /* XXX */

	/* SFU r4 results come back two instructions later. No doing
	* r4 read/writes or other SFU lookups until it's done.
	*/
	if (state->ip - state->last_sfu_write < 2) {
	if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4))
	fail_instr(state, "R4 read too soon after SFU");

	if (v3d_qpu_writes_r4(devinfo, inst))
	fail_instr(state, "R4 write too soon after SFU");

	if (sfu_writes)
	fail_instr(state, "SFU write too soon after SFU");
	}

	/* XXX: The docs say VPM can happen with the others, but the simulator
	* disagrees.
	*/
	if (tmu_writes +
	sfu_writes +
	vpm_writes +
	tlb_writes +
	tsy_writes +
	(devinfo->ver == 42 ? inst->sig.ldtmu : 0) +
	inst->sig.ldtlb +
	inst->sig.ldvpm +
	inst->sig.ldtlbu > 1) {
	fail_instr(state,
	"Only one of [TMU, SFU, TSY, TLB read, VPM] allowed");
	}

	if (sfu_writes)
	state->last_sfu_write = state->ip;

	if (inst->sig.thrsw) {
	if (in_branch_delay_slots(state))
	fail_instr(state, "THRSW in a branch delay slot.");

	if (state->last_thrsw_found)
	state->thrend_found = true;

	if (state->last_thrsw_ip == state->ip - 1) {
	/* If it's the second THRSW in a row, then it's just a
	* last-thrsw signal.
	*/
	if (state->last_thrsw_found)
	fail_instr(state, "Two last-THRSW signals");
	state->last_thrsw_found = true;
	} else {
	if (in_thrsw_delay_slots(state)) {
	fail_instr(state,
	"THRSW too close to another THRSW.");
	}
	state->thrsw_count++;
	state->last_thrsw_ip = state->ip;
	}
	}

	if (state->thrend_found &&
	state->last_thrsw_ip - state->ip <= 2 &&
	inst->type == V3D_QPU_INSTR_TYPE_ALU) {
	if ((inst->alu.add.op != V3D_QPU_A_NOP &&
	!inst->alu.add.magic_write)) {
	if (devinfo->ver == 42) {
	fail_instr(state, "RF write after THREND");
	} else if (devinfo->ver >= 71) {
	if (state->last_thrsw_ip - state->ip == 0) {
	fail_instr(state,
	"ADD RF write at THREND");
	}
	if (inst->alu.add.waddr == 2 \|\|
	inst->alu.add.waddr == 3) {
	fail_instr(state,
	"RF2-3 write after THREND");
	}
	}
	}

	if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
	!inst->alu.mul.magic_write)) {
	if (devinfo->ver == 42) {
	fail_instr(state, "RF write after THREND");
	} else if (devinfo->ver >= 71) {
	if (state->last_thrsw_ip - state->ip == 0) {
	fail_instr(state,
	"MUL RF write at THREND");
	}

	if (inst->alu.mul.waddr == 2 \|\|
	inst->alu.mul.waddr == 3) {
	fail_instr(state,
	"RF2-3 write after THREND");
	}
	}
	}

	if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
	!inst->sig_magic) {
	if (devinfo->ver == 42) {
	fail_instr(state, "RF write after THREND");
	} else if (devinfo->ver >= 71 &&
	(inst->sig_addr == 2 \|\|
	inst->sig_addr == 3)) {
	fail_instr(state, "RF2-3 write after THREND");
	}
	}

	/* GFXH-1625: No TMUWT in the last instruction */
	if (state->last_thrsw_ip - state->ip == 2 &&
	inst->alu.add.op == V3D_QPU_A_TMUWT)
	fail_instr(state, "TMUWT in last instruction");
	}

	if (state->rtop_valid && state->ip == state->last_thrsw_ip + 2) {
	state->rtop_hazard = true;
	state->rtop_valid = false;
	}

	if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
	if (in_branch_delay_slots(state))
	fail_instr(state, "branch in a branch delay slot.");
	if (in_thrsw_delay_slots(state))
	fail_instr(state, "branch in a THRSW delay slot.");
	state->last_branch_ip = state->ip;
	}
	}

	/* Validates each instruction that vir_for_each_inst() yields for block.
	 * After every instruction, state->last is pointed at it and state->ip is
	 * advanced by one.
	 *
	 * Both parameters are pointers: state is updated in place with "->" and
	 * is passed on to qpu_validate_inst().
	 */
	static void
	qpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block)
	{
		vir_for_each_inst(qinst, block) {
			qpu_validate_inst(state, qinst);

			state->last = &qinst->qpu;
			state->ip++;
		}
	}

	/**
	* Checks for the instruction restrictions from page 37 ("Summary of
	* Instruction Restrictions").
	*/
	void
	qpu_validate(struct v3d_compile *c)
	{
	/* We don't want to do validation in release builds, but we want to
	* keep compiling the validation code to make sure it doesn't get
	* broken.
	*/
	#if !MESA_DEBUG
	return;
	#endif

	struct v3d_qpu_validate_state state = {
	.c = c,
	.last_sfu_write = -10,
	.last_thrsw_ip = -10,
	.last_branch_ip = -10,
	.first_tlb_z_write = INT_MAX,
	.ip = 0,

	.last_thrsw_found = !c->last_thrsw,
	.rtop_hazard = false,
	.rtop_valid = false,
	};

	vir_for_each_block(block, c) {
	qpu_validate_block(&state, block);
	}

	if (state.thrsw_count > 1 && !state.last_thrsw_found) {
	fail_instr(&state,
	"thread switch found without last-THRSW in program");
	}

	if (!state.thrend_found)
	fail_instr(&state, "No program-end THRSW found");
	}