| /* |
| * Copyright © 2014 Broadcom |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| /** |
| * @file |
| * |
| * Validates the QPU instruction sequence after register allocation and |
| * scheduling. |
| */ |
| |
| #include <assert.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include "v3d_compiler.h" |
| #include "qpu/qpu_disasm.h" |
| |
/* Running state carried from one instruction to the next while a program is
 * being validated.
 */
struct v3d_qpu_validate_state {
        /* Compile whose instruction stream is being validated. */
        struct v3d_compile *c;

        /* QPU encoding of the previously validated instruction, or NULL
         * before the first one has been processed.
         */
        const struct v3d_qpu_instr *last;

        /* Index of the instruction currently being validated; it keeps
         * counting across blocks.
         */
        int ip;

        /* ip of the most recent instruction that did an SFU magic write. */
        int last_sfu_write;

        /* ip of the last branch instruction recorded by the validator. */
        int last_branch_ip;

        /* ip of the last THRSW recorded by the validator.  The second THRSW
         * of a back-to-back pair (the last-THRSW signal) is not recorded
         * here.
         */
        int last_thrsw_ip;

        /* Lowest ip of an instruction flagged as a TLB Z write. */
        int first_tlb_z_write;

        /* Set when we've found the last-THRSW signal, or if we were started
         * in single-segment mode.
         */
        bool last_thrsw_found;

        /* Set when we've found the THRSW after the last THRSW */
        bool thrend_found;

        /* Number of THRSWs seen, not counting the second one of a
         * back-to-back pair.
         */
        int thrsw_count;

        /* Set when a THRSW went by while rtop_valid was set; a UMUL24 seen
         * while this is set is an error.
         */
        bool rtop_hazard;

        /* Set by a MULTOP, cleared by the next UMUL24 or when it is turned
         * into rtop_hazard.
         */
        bool rtop_valid;
};
| |
| static void |
| fail_instr(struct v3d_qpu_validate_state *state, const char *msg) |
| { |
| struct v3d_compile *c = state->c; |
| |
| fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg); |
| |
| int dump_ip = 0; |
| vir_for_each_inst_inorder(inst, c) { |
| v3d_qpu_dump(c->devinfo, &inst->qpu); |
| |
| if (dump_ip++ == state->ip) |
| fprintf(stderr, " *** ERROR ***"); |
| |
| fprintf(stderr, "\n"); |
| } |
| |
| fprintf(stderr, "\n"); |
| abort(); |
| } |
| |
| static bool |
| in_branch_delay_slots(struct v3d_qpu_validate_state *state) |
| { |
| return (state->ip - state->last_branch_ip) < 3; |
| } |
| |
| static bool |
| in_thrsw_delay_slots(struct v3d_qpu_validate_state *state) |
| { |
| return (state->ip - state->last_thrsw_ip) < 3; |
| } |
| |
| static bool |
| qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst, |
| bool (*predicate)(enum v3d_qpu_waddr waddr)) |
| { |
| if (inst->type == V3D_QPU_INSTR_TYPE_ALU) |
| return false; |
| |
| if (inst->alu.add.op != V3D_QPU_A_NOP && |
| inst->alu.add.magic_write && |
| predicate(inst->alu.add.waddr)) |
| return true; |
| |
| if (inst->alu.mul.op != V3D_QPU_M_NOP && |
| inst->alu.mul.magic_write && |
| predicate(inst->alu.mul.waddr)) |
| return true; |
| |
| return false; |
| } |
| |
| static void |
| qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst) |
| { |
| const struct v3d_device_info *devinfo = state->c->devinfo; |
| |
| if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write) |
| state->first_tlb_z_write = state->ip; |
| |
| const struct v3d_qpu_instr *inst = &qinst->qpu; |
| |
| if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && |
| state->first_tlb_z_write >= 0 && |
| state->ip > state->first_tlb_z_write && |
| inst->branch.msfign != V3D_QPU_MSFIGN_NONE && |
| inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS && |
| inst->branch.cond != V3D_QPU_BRANCH_COND_A0 && |
| inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) { |
| fail_instr(state, "Implicit branch MSF read after TLB Z write"); |
| } |
| |
| if (inst->type != V3D_QPU_INSTR_TYPE_ALU) |
| return; |
| |
| if (inst->alu.mul.op == V3D_QPU_M_MULTOP) |
| state->rtop_valid = true; |
| |
| if (inst->alu.mul.op == V3D_QPU_M_UMUL24) { |
| if (state->rtop_hazard) |
| fail_instr(state, "UMUL24 reads rtop from MULTOP but it got cleared by a previous THRSW"); |
| state->rtop_valid = false; |
| state->rtop_hazard = false; |
| } |
| |
| if (inst->alu.add.op == V3D_QPU_A_SETMSF && |
| state->first_tlb_z_write >= 0 && |
| state->ip > state->first_tlb_z_write) { |
| fail_instr(state, "SETMSF after TLB Z write"); |
| } |
| |
| if (state->first_tlb_z_write >= 0 && |
| state->ip > state->first_tlb_z_write && |
| inst->alu.add.op == V3D_QPU_A_MSF) { |
| fail_instr(state, "MSF read after TLB Z write"); |
| } |
| |
| if (devinfo->ver < 71) { |
| if (inst->sig.small_imm_a || inst->sig.small_imm_c || |
| inst->sig.small_imm_d) { |
| fail_instr(state, "small imm a/c/d added after V3D 7.1"); |
| } |
| } else { |
| if ((inst->sig.small_imm_a || inst->sig.small_imm_b) && |
| !vir_is_add(qinst)) { |
| fail_instr(state, "small imm a/b used but no ADD inst"); |
| } |
| if ((inst->sig.small_imm_c || inst->sig.small_imm_d) && |
| !vir_is_mul(qinst)) { |
| fail_instr(state, "small imm c/d used but no MUL inst"); |
| } |
| if (inst->sig.small_imm_a + inst->sig.small_imm_b + |
| inst->sig.small_imm_c + inst->sig.small_imm_d > 1) { |
| fail_instr(state, "only one small immediate can be " |
| "enabled per instruction"); |
| } |
| } |
| |
| /* LDVARY writes r5 two instructions later and LDUNIF writes |
| * r5 one instruction later, which is illegal to have |
| * together. |
| */ |
| if (state->last && state->last->sig.ldvary && |
| (inst->sig.ldunif || inst->sig.ldunifa)) { |
| fail_instr(state, "LDUNIF after a LDVARY"); |
| } |
| |
| /* GFXH-1633 (fixed since V3D 4.2.14, which is Rpi4) |
| * |
| * FIXME: This would not check correctly for V3D 4.2 versions lower |
| * than V3D 4.2.14, but that is not a real issue because the simulator |
| * will still catch this, and we are not really targeting any such |
| * versions anyway. |
| */ |
| if (state->c->devinfo->ver < 42) { |
| bool last_reads_ldunif = (state->last && (state->last->sig.ldunif || |
| state->last->sig.ldunifrf)); |
| bool last_reads_ldunifa = (state->last && (state->last->sig.ldunifa || |
| state->last->sig.ldunifarf)); |
| bool reads_ldunif = inst->sig.ldunif || inst->sig.ldunifrf; |
| bool reads_ldunifa = inst->sig.ldunifa || inst->sig.ldunifarf; |
| if ((last_reads_ldunif && reads_ldunifa) || |
| (last_reads_ldunifa && reads_ldunif)) { |
| fail_instr(state, |
| "LDUNIF and LDUNIFA can't be next to each other"); |
| } |
| } |
| |
| int tmu_writes = 0; |
| int sfu_writes = 0; |
| int vpm_writes = 0; |
| int tlb_writes = 0; |
| int tsy_writes = 0; |
| |
| if (inst->alu.add.op != V3D_QPU_A_NOP) { |
| if (inst->alu.add.magic_write) { |
| if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo, |
| inst->alu.add.waddr)) { |
| tmu_writes++; |
| } |
| if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) |
| sfu_writes++; |
| if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr)) |
| vpm_writes++; |
| if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr)) |
| tlb_writes++; |
| if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) |
| tsy_writes++; |
| } |
| } |
| |
| if (inst->alu.mul.op != V3D_QPU_M_NOP) { |
| if (inst->alu.mul.magic_write) { |
| if (v3d_qpu_magic_waddr_is_tmu(state->c->devinfo, |
| inst->alu.mul.waddr)) { |
| tmu_writes++; |
| } |
| if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr)) |
| sfu_writes++; |
| if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr)) |
| vpm_writes++; |
| if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr)) |
| tlb_writes++; |
| if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr)) |
| tsy_writes++; |
| } |
| } |
| |
| if (in_thrsw_delay_slots(state)) { |
| /* There's no way you want to start SFU during the THRSW delay |
| * slots, since the result would land in the other thread. |
| */ |
| if (sfu_writes) { |
| fail_instr(state, |
| "SFU write started during THRSW delay slots "); |
| } |
| |
| if (inst->sig.ldvary) { |
| if (devinfo->ver == 42) |
| fail_instr(state, "LDVARY during THRSW delay slots"); |
| if (devinfo->ver >= 71 && |
| state->ip - state->last_thrsw_ip == 2) { |
| fail_instr(state, "LDVARY in 2nd THRSW delay slot"); |
| } |
| } |
| } |
| |
| (void)qpu_magic_waddr_matches; /* XXX */ |
| |
| /* SFU r4 results come back two instructions later. No doing |
| * r4 read/writes or other SFU lookups until it's done. |
| */ |
| if (state->ip - state->last_sfu_write < 2) { |
| if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4)) |
| fail_instr(state, "R4 read too soon after SFU"); |
| |
| if (v3d_qpu_writes_r4(devinfo, inst)) |
| fail_instr(state, "R4 write too soon after SFU"); |
| |
| if (sfu_writes) |
| fail_instr(state, "SFU write too soon after SFU"); |
| } |
| |
| /* XXX: The docs say VPM can happen with the others, but the simulator |
| * disagrees. |
| */ |
| if (tmu_writes + |
| sfu_writes + |
| vpm_writes + |
| tlb_writes + |
| tsy_writes + |
| (devinfo->ver == 42 ? inst->sig.ldtmu : 0) + |
| inst->sig.ldtlb + |
| inst->sig.ldvpm + |
| inst->sig.ldtlbu > 1) { |
| fail_instr(state, |
| "Only one of [TMU, SFU, TSY, TLB read, VPM] allowed"); |
| } |
| |
| if (sfu_writes) |
| state->last_sfu_write = state->ip; |
| |
| if (inst->sig.thrsw) { |
| if (in_branch_delay_slots(state)) |
| fail_instr(state, "THRSW in a branch delay slot."); |
| |
| if (state->last_thrsw_found) |
| state->thrend_found = true; |
| |
| if (state->last_thrsw_ip == state->ip - 1) { |
| /* If it's the second THRSW in a row, then it's just a |
| * last-thrsw signal. |
| */ |
| if (state->last_thrsw_found) |
| fail_instr(state, "Two last-THRSW signals"); |
| state->last_thrsw_found = true; |
| } else { |
| if (in_thrsw_delay_slots(state)) { |
| fail_instr(state, |
| "THRSW too close to another THRSW."); |
| } |
| state->thrsw_count++; |
| state->last_thrsw_ip = state->ip; |
| } |
| } |
| |
| if (state->thrend_found && |
| state->last_thrsw_ip - state->ip <= 2 && |
| inst->type == V3D_QPU_INSTR_TYPE_ALU) { |
| if ((inst->alu.add.op != V3D_QPU_A_NOP && |
| !inst->alu.add.magic_write)) { |
| if (devinfo->ver == 42) { |
| fail_instr(state, "RF write after THREND"); |
| } else if (devinfo->ver >= 71) { |
| if (state->last_thrsw_ip - state->ip == 0) { |
| fail_instr(state, |
| "ADD RF write at THREND"); |
| } |
| if (inst->alu.add.waddr == 2 || |
| inst->alu.add.waddr == 3) { |
| fail_instr(state, |
| "RF2-3 write after THREND"); |
| } |
| } |
| } |
| |
| if ((inst->alu.mul.op != V3D_QPU_M_NOP && |
| !inst->alu.mul.magic_write)) { |
| if (devinfo->ver == 42) { |
| fail_instr(state, "RF write after THREND"); |
| } else if (devinfo->ver >= 71) { |
| if (state->last_thrsw_ip - state->ip == 0) { |
| fail_instr(state, |
| "MUL RF write at THREND"); |
| } |
| |
| if (inst->alu.mul.waddr == 2 || |
| inst->alu.mul.waddr == 3) { |
| fail_instr(state, |
| "RF2-3 write after THREND"); |
| } |
| } |
| } |
| |
| if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && |
| !inst->sig_magic) { |
| if (devinfo->ver == 42) { |
| fail_instr(state, "RF write after THREND"); |
| } else if (devinfo->ver >= 71 && |
| (inst->sig_addr == 2 || |
| inst->sig_addr == 3)) { |
| fail_instr(state, "RF2-3 write after THREND"); |
| } |
| } |
| |
| /* GFXH-1625: No TMUWT in the last instruction */ |
| if (state->last_thrsw_ip - state->ip == 2 && |
| inst->alu.add.op == V3D_QPU_A_TMUWT) |
| fail_instr(state, "TMUWT in last instruction"); |
| } |
| |
| if (state->rtop_valid && state->ip == state->last_thrsw_ip + 2) { |
| state->rtop_hazard = true; |
| state->rtop_valid = false; |
| } |
| |
| if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) { |
| if (in_branch_delay_slots(state)) |
| fail_instr(state, "branch in a branch delay slot."); |
| if (in_thrsw_delay_slots(state)) |
| fail_instr(state, "branch in a THRSW delay slot."); |
| state->last_branch_ip = state->ip; |
| } |
| } |
| |
| static void |
| qpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block) |
| { |
| vir_for_each_inst(qinst, block) { |
| qpu_validate_inst(state, qinst); |
| |
| state->last = &qinst->qpu; |
| state->ip++; |
| } |
| } |
| |
| /** |
| * Checks for the instruction restrictions from page 37 ("Summary of |
| * Instruction Restrictions"). |
| */ |
| void |
| qpu_validate(struct v3d_compile *c) |
| { |
| /* We don't want to do validation in release builds, but we want to |
| * keep compiling the validation code to make sure it doesn't get |
| * broken. |
| */ |
| #if !MESA_DEBUG |
| return; |
| #endif |
| |
| struct v3d_qpu_validate_state state = { |
| .c = c, |
| .last_sfu_write = -10, |
| .last_thrsw_ip = -10, |
| .last_branch_ip = -10, |
| .first_tlb_z_write = INT_MAX, |
| .ip = 0, |
| |
| .last_thrsw_found = !c->last_thrsw, |
| .rtop_hazard = false, |
| .rtop_valid = false, |
| }; |
| |
| vir_for_each_block(block, c) { |
| qpu_validate_block(&state, block); |
| } |
| |
| if (state.thrsw_count > 1 && !state.last_thrsw_found) { |
| fail_instr(&state, |
| "thread switch found without last-THRSW in program"); |
| } |
| |
| if (!state.thrend_found) |
| fail_instr(&state, "No program-end THRSW found"); |
| } |