| /* |
| * Copyright (C) 2021 Valve Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #include "ir3.h" |
| |
| /* Lower several macro-instructions needed for shader subgroup support that |
| * must be turned into if statements. We do this after RA and post-RA |
| * scheduling to give the scheduler a chance to rearrange them, because RA |
| * may need to insert OPC_META_READ_FIRST to handle splitting live ranges, and |
| * also because some (e.g. BALLOT and READ_FIRST) must produce a shared |
| * register that cannot be spilled to a normal register until after the if, |
| * which makes implementing spilling more complicated if they are already |
| * lowered. |
| */ |
| |
| static void |
| replace_pred(struct ir3_block *block, struct ir3_block *old_pred, |
| struct ir3_block *new_pred) |
| { |
| for (unsigned i = 0; i < block->predecessors_count; i++) { |
| if (block->predecessors[i] == old_pred) { |
| block->predecessors[i] = new_pred; |
| return; |
| } |
| } |
| } |
| |
| static void |
| replace_physical_pred(struct ir3_block *block, struct ir3_block *old_pred, |
| struct ir3_block *new_pred) |
| { |
| for (unsigned i = 0; i < block->physical_predecessors_count; i++) { |
| if (block->physical_predecessors[i] == old_pred) { |
| block->physical_predecessors[i] = new_pred; |
| return; |
| } |
| } |
| } |
| |
/* Append a "mov dst, #immed" to block, writing the immediate to every
 * component covered by dst's write-mask via (rpt). Uses u16 types when
 * dst is a half register, u32 otherwise.
 */
static void
mov_immed(struct ir3_register *dst, struct ir3_block *block, unsigned immed)
{
   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   /* Destination mirrors dst exactly: register number, flags, wrmask. */
   struct ir3_register *mov_dst = ir3_dst_create(mov, dst->num, dst->flags);
   mov_dst->wrmask = dst->wrmask;
   /* Immediate source; carry the half flag so src/dst widths agree. */
   struct ir3_register *src = ir3_src_create(
      mov, INVALID_REG, (dst->flags & IR3_REG_HALF) | IR3_REG_IMMED);
   src->uim_val = immed;
   mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   mov->cat1.src_type = mov->cat1.dst_type;
   /* (rpt) replicates the mov across all components in the wrmask. */
   mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
}
| |
| static void |
| mov_reg(struct ir3_block *block, struct ir3_register *dst, |
| struct ir3_register *src) |
| { |
| struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); |
| |
| struct ir3_register *mov_dst = |
| ir3_dst_create(mov, dst->num, dst->flags & (IR3_REG_HALF | IR3_REG_SHARED)); |
| struct ir3_register *mov_src = |
| ir3_src_create(mov, src->num, src->flags & (IR3_REG_HALF | IR3_REG_SHARED)); |
| mov_dst->wrmask = dst->wrmask; |
| mov_src->wrmask = src->wrmask; |
| mov->repeat = util_last_bit(mov_dst->wrmask) - 1; |
| |
| mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; |
| mov->cat1.src_type = (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; |
| } |
| |
| static void |
| binop(struct ir3_block *block, opc_t opc, struct ir3_register *dst, |
| struct ir3_register *src0, struct ir3_register *src1) |
| { |
| struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 2); |
| |
| unsigned flags = dst->flags & IR3_REG_HALF; |
| struct ir3_register *instr_dst = ir3_dst_create(instr, dst->num, flags); |
| struct ir3_register *instr_src0 = ir3_src_create(instr, src0->num, flags); |
| struct ir3_register *instr_src1 = ir3_src_create(instr, src1->num, flags); |
| |
| instr_dst->wrmask = dst->wrmask; |
| instr_src0->wrmask = src0->wrmask; |
| instr_src1->wrmask = src1->wrmask; |
| instr->repeat = util_last_bit(instr_dst->wrmask) - 1; |
| } |
| |
| static void |
| triop(struct ir3_block *block, opc_t opc, struct ir3_register *dst, |
| struct ir3_register *src0, struct ir3_register *src1, |
| struct ir3_register *src2) |
| { |
| struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 3); |
| |
| unsigned flags = dst->flags & IR3_REG_HALF; |
| struct ir3_register *instr_dst = ir3_dst_create(instr, dst->num, flags); |
| struct ir3_register *instr_src0 = ir3_src_create(instr, src0->num, flags); |
| struct ir3_register *instr_src1 = ir3_src_create(instr, src1->num, flags); |
| struct ir3_register *instr_src2 = ir3_src_create(instr, src2->num, flags); |
| |
| instr_dst->wrmask = dst->wrmask; |
| instr_src0->wrmask = src0->wrmask; |
| instr_src1->wrmask = src1->wrmask; |
| instr_src2->wrmask = src2->wrmask; |
| instr->repeat = util_last_bit(instr_dst->wrmask) - 1; |
| } |
| |
/* Emit the ALU instruction(s) implementing one step of a subgroup
 * reduction: dst = src0 OP src1, where OP is selected by the reduce op.
 * Most ops map 1:1 onto a hardware opcode of the same name.
 */
static void
do_reduce(struct ir3_block *block, reduce_op_t opc,
          struct ir3_register *dst, struct ir3_register *src0,
          struct ir3_register *src1)
{
   switch (opc) {
/* Expand REDUCE_OP_<name> -> binop with OPC_<name>. */
#define CASE(name)                                                             \
   case REDUCE_OP_##name:                                                      \
      binop(block, OPC_##name, dst, src0, src1);                               \
      break;

      CASE(ADD_U)
      CASE(ADD_F)
      CASE(MUL_F)
      CASE(MIN_U)
      CASE(MIN_S)
      CASE(MIN_F)
      CASE(MAX_U)
      CASE(MAX_S)
      CASE(MAX_F)
      CASE(AND_B)
      CASE(OR_B)
      CASE(XOR_B)

#undef CASE

   case REDUCE_OP_MUL_U:
      if (dst->flags & IR3_REG_HALF) {
         /* 16-bit operands fit in mul_s24's 24-bit inputs, so a single
          * multiply suffices for half registers.
          */
         binop(block, OPC_MUL_S24, dst, src0, src1);
      } else {
         /* 32-bit multiplication macro - see ir3_nir_imul */
         binop(block, OPC_MULL_U, dst, src0, src1);
         triop(block, OPC_MADSH_M16, dst, src0, src1, dst);
         triop(block, OPC_MADSH_M16, dst, src1, src0, dst);
      }
      break;
   }
}
| |
/* Split before_block just before instr: a new block is created immediately
 * after it in the block list, instr and everything following it are moved
 * into the new block, and all successor/predecessor edges (both logical
 * and physical) are transferred to the new block. before_block is left
 * with no successors; the caller is responsible for re-linking it.
 * Returns the newly created "after" block.
 */
static struct ir3_block *
split_block(struct ir3 *ir, struct ir3_block *before_block,
            struct ir3_instruction *instr)
{
   struct ir3_block *after_block = ir3_block_create(ir);
   list_add(&after_block->node, &before_block->node);

   /* Move logical successor edges to after_block and patch each
    * successor's predecessor array to match.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {
      after_block->successors[i] = before_block->successors[i];
      if (after_block->successors[i])
         replace_pred(after_block->successors[i], before_block, after_block);
   }

   /* Same transfer for the physical CFG. */
   for (unsigned i = 0; i < ARRAY_SIZE(before_block->physical_successors);
        i++) {
      after_block->physical_successors[i] =
         before_block->physical_successors[i];
      if (after_block->physical_successors[i]) {
         replace_physical_pred(after_block->physical_successors[i],
                               before_block, after_block);
      }
   }

   before_block->successors[0] = before_block->successors[1] = NULL;
   before_block->physical_successors[0] = before_block->physical_successors[1] = NULL;

   /* Move instr and all following instructions into after_block. */
   foreach_instr_from_safe (rem_instr, &instr->node,
                            &before_block->instr_list) {
      list_del(&rem_instr->node);
      list_addtail(&rem_instr->node, &after_block->instr_list);
      rem_instr->block = after_block;
   }

   /* The branch at the end of before_block (if any) now logically
    * belongs to after_block.
    */
   after_block->brtype = before_block->brtype;
   after_block->condition = before_block->condition;

   return after_block;
}
| |
/* Add a physical-CFG edge pred -> succ, using successor slot `index`
 * (0 = fallthrough/taken, 1 = the other branch target).
 */
static void
link_blocks_physical(struct ir3_block *pred, struct ir3_block *succ,
                     unsigned index)
{
   pred->physical_successors[index] = succ;
   ir3_block_add_physical_predecessor(succ, pred);
}
| |
/* Add both a logical and a physical CFG edge pred -> succ in successor
 * slot `index`, keeping the predecessor lists in sync.
 */
static void
link_blocks(struct ir3_block *pred, struct ir3_block *succ, unsigned index)
{
   pred->successors[index] = succ;
   ir3_block_add_predecessor(succ, pred);
   link_blocks_physical(pred, succ, index);
}
| |
/* Build the skeleton of an if with an empty then-side:
 *
 *    before_block -> then_block -> after_block
 *               \________________/
 *
 * before_block's branch type/condition are set by the caller. Returns
 * the new then-block, into which the caller emits the guarded code.
 */
static struct ir3_block *
create_if(struct ir3 *ir, struct ir3_block *before_block,
          struct ir3_block *after_block)
{
   struct ir3_block *then_block = ir3_block_create(ir);
   list_add(&then_block->node, &before_block->node);

   /* Slot 0 = branch taken into the then-side, slot 1 = skip over it. */
   link_blocks(before_block, then_block, 0);
   link_blocks(before_block, after_block, 1);
   link_blocks(then_block, after_block, 0);

   return then_block;
}
| |
/* If instr is one of the subgroup macro instructions, lower it into real
 * control flow: the containing block is split at instr, the appropriate
 * if/loop structure is built between the halves, and instr is replaced by
 * concrete instructions inside the new blocks. On success *block is
 * updated to the block following the inserted control flow (so the caller
 * can resume iteration there) and true is returned; returns false if
 * instr is not a macro this pass handles.
 */
static bool
lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SWZ_SHARED_MACRO:
   case OPC_SCAN_MACRO:
      break;
   default:
      return false;
   }

   struct ir3_block *before_block = *block;
   struct ir3_block *after_block = split_block(ir, before_block, instr);

   if (instr->opc == OPC_SCAN_MACRO) {
      /* The pseudo-code for the scan macro is:
       *
       * while (true) {
       *    header:
       *    if (elect()) {
       *        exit:
       *        exclusive = reduce;
       *        inclusive = src OP exclusive;
       *        reduce = inclusive;
       *    }
       *    footer:
       * }
       *
       * This is based on the blob's sequence, and carefully crafted to avoid
       * using the shared register "reduce" except in move instructions, since
       * using it in the actual OP isn't possible for half-registers.
       */
      struct ir3_block *header = ir3_block_create(ir);
      list_add(&header->node, &before_block->node);

      struct ir3_block *exit = ir3_block_create(ir);
      list_add(&exit->node, &header->node);

      struct ir3_block *footer = ir3_block_create(ir);
      list_add(&footer->node, &exit->node);

      link_blocks(before_block, header, 0);

      /* header branches getone: elected fiber goes to exit, the rest to
       * footer, which loops back to header.
       */
      link_blocks(header, exit, 0);
      link_blocks(header, footer, 1);
      header->brtype = IR3_BRANCH_GETONE;

      link_blocks(exit, after_block, 0);
      /* Physical-only edge: execution can fall through from exit into
       * footer even though the logical CFG exits the loop.
       */
      link_blocks_physical(exit, footer, 1);

      link_blocks(footer, header, 0);

      struct ir3_register *exclusive = instr->dsts[0];
      struct ir3_register *inclusive = instr->dsts[1];
      struct ir3_register *reduce = instr->dsts[2];
      struct ir3_register *src = instr->srcs[0];

      /* Body of the if in the pseudo-code above. */
      mov_reg(exit, exclusive, reduce);
      do_reduce(exit, instr->cat1.reduce_op, inclusive, src, exclusive);
      mov_reg(exit, reduce, inclusive);
   } else {
      struct ir3_block *then_block = create_if(ir, before_block, after_block);

      /* For ballot, the destination must be initialized to 0 before we do
       * the movmsk because the condition may be 0 and then the movmsk will
       * be skipped. Because it's a shared register we have to wrap the
       * initialization in a getone block.
       */
      if (instr->opc == OPC_BALLOT_MACRO) {
         before_block->brtype = IR3_BRANCH_GETONE;
         before_block->condition = NULL;
         mov_immed(instr->dsts[0], then_block, 0);
         /* Chain a second if after the getone block; the movmsk below is
          * emitted into this second if's then-block.
          */
         before_block = after_block;
         after_block = split_block(ir, before_block, instr);
         then_block = create_if(ir, before_block, after_block);
      }

      /* Only the macros that branch on a value have a condition source. */
      switch (instr->opc) {
      case OPC_BALLOT_MACRO:
      case OPC_READ_COND_MACRO:
      case OPC_ANY_MACRO:
      case OPC_ALL_MACRO:
         before_block->condition = instr->srcs[0]->def->instr;
         break;
      default:
         before_block->condition = NULL;
         break;
      }

      /* Pick the branch instruction guarding the then-block. */
      switch (instr->opc) {
      case OPC_BALLOT_MACRO:
      case OPC_READ_COND_MACRO:
         before_block->brtype = IR3_BRANCH_COND;
         break;
      case OPC_ANY_MACRO:
         before_block->brtype = IR3_BRANCH_ANY;
         break;
      case OPC_ALL_MACRO:
         before_block->brtype = IR3_BRANCH_ALL;
         break;
      case OPC_ELECT_MACRO:
      case OPC_READ_FIRST_MACRO:
      case OPC_SWZ_SHARED_MACRO:
         before_block->brtype = IR3_BRANCH_GETONE;
         break;
      default:
         unreachable("bad opcode");
      }

      /* Emit the actual guarded instruction(s) replacing the macro. */
      switch (instr->opc) {
      case OPC_ALL_MACRO:
      case OPC_ANY_MACRO:
      case OPC_ELECT_MACRO:
         /* dst = 1 on the taken path, 0 on the fallthrough path. */
         mov_immed(instr->dsts[0], then_block, 1);
         mov_immed(instr->dsts[0], before_block, 0);
         break;

      case OPC_BALLOT_MACRO: {
         unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
         struct ir3_instruction *movmsk =
            ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
         ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
         movmsk->repeat = comp_count - 1;
         break;
      }

      case OPC_READ_COND_MACRO:
      case OPC_READ_FIRST_MACRO: {
         struct ir3_instruction *mov =
            ir3_instr_create(then_block, OPC_MOV, 1, 1);
         /* READ_COND's src 0 is the condition; the value read is src 1. */
         unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
         ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
         /* Copy the macro's source register wholesale into the mov. */
         struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
         *new_src = *instr->srcs[src];
         mov->cat1.dst_type = TYPE_U32;
         mov->cat1.src_type =
            (new_src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
         break;
      }

      case OPC_SWZ_SHARED_MACRO: {
         struct ir3_instruction *swz =
            ir3_instr_create(then_block, OPC_SWZ, 2, 2);
         ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
         ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
         ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
         ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
         swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
         swz->repeat = 1;
         break;
      }

      default:
         unreachable("bad opcode");
      }
   }

   *block = after_block;
   /* The macro itself is gone; its replacement lives in the new blocks. */
   list_delinit(&instr->node);
   return true;
}
| |
| static bool |
| lower_block(struct ir3 *ir, struct ir3_block **block) |
| { |
| bool progress = true; |
| |
| bool inner_progress; |
| do { |
| inner_progress = false; |
| foreach_instr (instr, &(*block)->instr_list) { |
| if (lower_instr(ir, block, instr)) { |
| /* restart the loop with the new block we created because the |
| * iterator has been invalidated. |
| */ |
| progress = inner_progress = true; |
| break; |
| } |
| } |
| } while (inner_progress); |
| |
| return progress; |
| } |
| |
/* Entry point: lower subgroup macro instructions in every block of the
 * shader. Returns true if anything was lowered. Note lower_block() may
 * insert new blocks; it advances `block` past them so iteration resumes
 * after the inserted control flow.
 */
bool
ir3_lower_subgroups(struct ir3 *ir)
{
   bool progress = false;

   foreach_block (block, &ir->block_list)
      progress |= lower_block(ir, &block);

   return progress;
}