| /* |
| * Copyright © 2011 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "elk_vec4.h" |
| #include "elk_cfg.h" |
| #include "elk_eu.h" |
| #include "util/u_math.h" |
| |
| namespace elk { |
| |
| vec4_instruction::vec4_instruction(enum elk_opcode opcode, const dst_reg &dst, |
| const src_reg &src0, const src_reg &src1, |
| const src_reg &src2) |
| { |
| this->opcode = opcode; |
| this->dst = dst; |
| this->src[0] = src0; |
| this->src[1] = src1; |
| this->src[2] = src2; |
| this->saturate = false; |
| this->force_writemask_all = false; |
| this->no_dd_clear = false; |
| this->no_dd_check = false; |
| this->writes_accumulator = false; |
| this->conditional_mod = ELK_CONDITIONAL_NONE; |
| this->predicate = ELK_PREDICATE_NONE; |
| this->predicate_inverse = false; |
| this->target = 0; |
| this->shadow_compare = false; |
| this->eot = false; |
| this->ir = NULL; |
| this->urb_write_flags = ELK_URB_WRITE_NO_FLAGS; |
| this->header_size = 0; |
| this->flag_subreg = 0; |
| this->mlen = 0; |
| this->base_mrf = 0; |
| this->offset = 0; |
| this->exec_size = 8; |
| this->group = 0; |
| this->size_written = (dst.file == BAD_FILE ? |
| 0 : this->exec_size * type_sz(dst.type)); |
| this->annotation = NULL; |
| } |
| |
| vec4_instruction * |
| vec4_visitor::emit(vec4_instruction *inst) |
| { |
| inst->ir = this->base_ir; |
| inst->annotation = this->current_annotation; |
| |
| this->instructions.push_tail(inst); |
| |
| return inst; |
| } |
| |
| vec4_instruction * |
| vec4_visitor::emit_before(elk_bblock_t *block, vec4_instruction *inst, |
| vec4_instruction *new_inst) |
| { |
| new_inst->ir = inst->ir; |
| new_inst->annotation = inst->annotation; |
| |
| inst->insert_before(block, new_inst); |
| |
| return inst; |
| } |
| |
| vec4_instruction * |
| vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0, |
| const src_reg &src1, const src_reg &src2) |
| { |
| return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2)); |
| } |
| |
| |
| vec4_instruction * |
| vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0, |
| const src_reg &src1) |
| { |
| return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1)); |
| } |
| |
| vec4_instruction * |
| vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0) |
| { |
| return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0)); |
| } |
| |
| vec4_instruction * |
| vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst) |
| { |
| return emit(new(mem_ctx) vec4_instruction(opcode, dst)); |
| } |
| |
| vec4_instruction * |
| vec4_visitor::emit(enum elk_opcode opcode) |
| { |
| return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg())); |
| } |
| |
| #define ALU1(op) \ |
| vec4_instruction * \ |
| vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \ |
| { \ |
| return new(mem_ctx) vec4_instruction(ELK_OPCODE_##op, dst, src0); \ |
| } |
| |
| #define ALU2(op) \ |
| vec4_instruction * \ |
| vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ |
| const src_reg &src1) \ |
| { \ |
| return new(mem_ctx) vec4_instruction(ELK_OPCODE_##op, dst, \ |
| src0, src1); \ |
| } |
| |
| #define ALU2_ACC(op) \ |
| vec4_instruction * \ |
| vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ |
| const src_reg &src1) \ |
| { \ |
| vec4_instruction *inst = new(mem_ctx) vec4_instruction( \ |
| ELK_OPCODE_##op, dst, src0, src1); \ |
| inst->writes_accumulator = true; \ |
| return inst; \ |
| } |
| |
| #define ALU3(op) \ |
| vec4_instruction * \ |
| vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ |
| const src_reg &src1, const src_reg &src2) \ |
| { \ |
| assert(devinfo->ver >= 6); \ |
| return new(mem_ctx) vec4_instruction(ELK_OPCODE_##op, dst, \ |
| src0, src1, src2); \ |
| } |
| |
| ALU1(NOT) |
| ALU1(MOV) |
| ALU1(FRC) |
| ALU1(RNDD) |
| ALU1(RNDE) |
| ALU1(RNDZ) |
| ALU1(F32TO16) |
| ALU1(F16TO32) |
| ALU2(ADD) |
| ALU2(MUL) |
| ALU2_ACC(MACH) |
| ALU2(AND) |
| ALU2(OR) |
| ALU2(XOR) |
| ALU2(DP3) |
| ALU2(DP4) |
| ALU2(DPH) |
| ALU2(SHL) |
| ALU2(SHR) |
| ALU2(ASR) |
| ALU3(LRP) |
| ALU1(BFREV) |
| ALU3(BFE) |
| ALU2(BFI1) |
| ALU3(BFI2) |
| ALU1(FBH) |
| ALU1(FBL) |
| ALU1(CBIT) |
| ALU1(LZD) |
| ALU3(MAD) |
| ALU2_ACC(ADDC) |
| ALU2_ACC(SUBB) |
| ALU2(MAC) |
| ALU1(DIM) |
| |
| /** Gfx4 predicated IF. */ |
| vec4_instruction * |
| vec4_visitor::IF(enum elk_predicate predicate) |
| { |
| vec4_instruction *inst; |
| |
| inst = new(mem_ctx) vec4_instruction(ELK_OPCODE_IF); |
| inst->predicate = predicate; |
| |
| return inst; |
| } |
| |
| /** Gfx6 IF with embedded comparison. */ |
| vec4_instruction * |
| vec4_visitor::IF(src_reg src0, src_reg src1, |
| enum elk_conditional_mod condition) |
| { |
| assert(devinfo->ver == 6); |
| |
| vec4_instruction *inst; |
| |
| resolve_ud_negate(&src0); |
| resolve_ud_negate(&src1); |
| |
| inst = new(mem_ctx) vec4_instruction(ELK_OPCODE_IF, dst_null_d(), |
| src0, src1); |
| inst->conditional_mod = condition; |
| |
| return inst; |
| } |
| |
| /** |
| * CMP: Sets the low bit of the destination channels with the result |
| * of the comparison, while the upper bits are undefined, and updates |
| * the flag register with the packed 16 bits of the result. |
| */ |
| vec4_instruction * |
| vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, |
| enum elk_conditional_mod condition) |
| { |
| vec4_instruction *inst; |
| |
| /* Take the instruction: |
| * |
| * CMP null<d> src0<f> src1<f> |
| * |
| * Original gfx4 does type conversion to the destination type before |
| * comparison, producing garbage results for floating point comparisons. |
| * |
| * The destination type doesn't matter on newer generations, so we set the |
| * type to match src0 so we can compact the instruction. |
| */ |
| dst.type = src0.type; |
| |
| resolve_ud_negate(&src0); |
| resolve_ud_negate(&src1); |
| |
| inst = new(mem_ctx) vec4_instruction(ELK_OPCODE_CMP, dst, src0, src1); |
| inst->conditional_mod = condition; |
| |
| return inst; |
| } |
| |
| vec4_instruction * |
| vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index) |
| { |
| vec4_instruction *inst; |
| |
| inst = new(mem_ctx) vec4_instruction(ELK_SHADER_OPCODE_GFX4_SCRATCH_READ, |
| dst, index); |
| inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver) + 1; |
| inst->mlen = 2; |
| |
| return inst; |
| } |
| |
| vec4_instruction * |
| vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src, |
| const src_reg &index) |
| { |
| vec4_instruction *inst; |
| |
| inst = new(mem_ctx) vec4_instruction(ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE, |
| dst, src, index); |
| inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver); |
| inst->mlen = 3; |
| |
| return inst; |
| } |
| |
| src_reg |
| vec4_visitor::fix_3src_operand(const src_reg &src) |
| { |
| /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be |
| * able to use vertical stride of zero to replicate the vec4 uniform, like |
| * |
| * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7] |
| * |
| * But you can't, since vertical stride is always four in three-source |
| * instructions. Instead, insert a MOV instruction to do the replication so |
| * that the three-source instruction can consume it. |
| */ |
| |
| /* The MOV is only needed if the source is a uniform or immediate. */ |
| if (src.file != UNIFORM && src.file != IMM) |
| return src; |
| |
| if (src.file == UNIFORM && elk_is_single_value_swizzle(src.swizzle)) |
| return src; |
| |
| dst_reg expanded = dst_reg(this, glsl_vec4_type()); |
| expanded.type = src.type; |
| emit(ELK_VEC4_OPCODE_UNPACK_UNIFORM, expanded, src); |
| return src_reg(expanded); |
| } |
| |
| src_reg |
| vec4_visitor::fix_math_operand(const src_reg &src) |
| { |
| if (devinfo->ver < 6 || src.file == BAD_FILE) |
| return src; |
| |
| /* The gfx6 math instruction ignores the source modifiers -- |
| * swizzle, abs, negate, and at least some parts of the register |
| * region description. |
| * |
| * Rather than trying to enumerate all these cases, *always* expand the |
| * operand to a temp GRF for gfx6. |
| * |
| * For gfx7, keep the operand as-is, except if immediate, which gfx7 still |
| * can't use. |
| */ |
| |
| if (devinfo->ver == 7 && src.file != IMM) |
| return src; |
| |
| dst_reg expanded = dst_reg(this, glsl_vec4_type()); |
| expanded.type = src.type; |
| emit(MOV(expanded, src)); |
| return src_reg(expanded); |
| } |
| |
| vec4_instruction * |
| vec4_visitor::emit_math(enum elk_opcode opcode, |
| const dst_reg &dst, |
| const src_reg &src0, const src_reg &src1) |
| { |
| vec4_instruction *math = |
| emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1)); |
| |
| if (devinfo->ver == 6 && dst.writemask != WRITEMASK_XYZW) { |
| /* MATH on Gfx6 must be align1, so we can't do writemasks. */ |
| math->dst = dst_reg(this, glsl_vec4_type()); |
| math->dst.type = dst.type; |
| math = emit(MOV(dst, src_reg(math->dst))); |
| } else if (devinfo->ver < 6) { |
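| /* Before gfx6, math is a send to the shared math unit with its operands |
| * passed through MRFs, one register per source, so record the message |
| * layout here. |
| */ |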
| math->base_mrf = 1; |
| math->mlen = src1.file == BAD_FILE ? 1 : 2; |
| } |
| |
| return math; |
| } |
| |
| void |
| vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0) |
| { |
| if (devinfo->ver < 7) { |
| unreachable("ir_unop_pack_half_2x16 should be lowered"); |
| } |
| |
| assert(dst.type == ELK_REGISTER_TYPE_UD); |
| assert(src0.type == ELK_REGISTER_TYPE_F); |
| |
| /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: |
| * |
| * Because this instruction does not have a 16-bit floating-point type, |
| * the destination data type must be Word (W). |
| * |
| * The destination must be DWord-aligned and specify a horizontal stride |
| * (HorzStride) of 2. The 16-bit result is stored in the lower word of |
| * each destination channel and the upper word is not modified. |
| * |
| * The above restriction implies that the f32to16 instruction must use |
| * align1 mode, because only in align1 mode is it possible to specify |
| * horizontal stride. We choose here to defy the hardware docs and emit |
| * align16 instructions. |
| * |
| * (I [chadv] did attempt to emit align1 instructions for VS f32to16 |
| * instructions. I was partially successful in that the code passed all |
| * tests. However, the code was dubiously correct and fragile, and the |
| * tests were not harsh enough to probe that frailty. Not trusting the |
| * code, I chose instead to remain in align16 mode in defiance of the hw |
| * docs). |
| * |
| * I've [chadv] experimentally confirmed that, on gfx7 hardware and the |
| * simulator, emitting a f32to16 in align16 mode with UD as destination |
| * data type is safe. The behavior differs from that specified in the PRM |
| * in that the upper word of each destination channel is cleared to 0. |
| */ |
| |
| dst_reg tmp_dst(this, glsl_uvec2_type()); |
| src_reg tmp_src(tmp_dst); |
| |
| #if 0 |
| /* Verify the undocumented behavior on which the following instructions |
| * rely. If f32to16 fails to clear the upper word of the X and Y channels, |
| * then the result of the bit-or instruction below will be incorrect. |
| * |
| * You should inspect the disasm output in order to verify that the MOV is |
| * not optimized away. |
| */ |
| emit(MOV(tmp_dst, elk_imm_ud(0x12345678u))); |
| #endif |
| |
| /* Give tmp the form below, where "." means untouched. |
| * |
| * w z y x w z y x |
| * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll| |
| * |
| * The upper word of each write-channel must be 0 for the following |
| * bit-shift and bit-or instructions to work. Note that this relies on the |
| * undocumented hardware behavior mentioned above. |
| */ |
| tmp_dst.writemask = WRITEMASK_XY; |
| emit(F32TO16(tmp_dst, src0)); |
| |
| /* Give the write-channels of dst the form: |
| * 0xhhhh0000 |
| */ |
| tmp_src.swizzle = ELK_SWIZZLE_YYYY; |
| emit(SHL(dst, tmp_src, elk_imm_ud(16u))); |
| |
| /* Finally, give the write-channels of dst the form of packHalf2x16's |
| * output: |
| * 0xhhhhllll |
| */ |
| tmp_src.swizzle = ELK_SWIZZLE_XXXX; |
| emit(OR(dst, src_reg(dst), tmp_src)); |
| } |
| |
| void |
| vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0) |
| { |
| if (devinfo->ver < 7) { |
| unreachable("ir_unop_unpack_half_2x16 should be lowered"); |
| } |
| |
| assert(dst.type == ELK_REGISTER_TYPE_F); |
| assert(src0.type == ELK_REGISTER_TYPE_UD); |
| |
| /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: |
| * |
| * Because this instruction does not have a 16-bit floating-point type, |
| * the source data type must be Word (W). The destination type must be |
| * F (Float). |
| * |
| * To use W as the source data type, we must adjust horizontal strides, |
| * which is only possible in align1 mode. All my [chadv] attempts at |
| * emitting align1 instructions for unpackHalf2x16 failed to pass the |
| * Piglit tests, so I gave up. |
| * |
| * I've verified that, on gfx7 hardware and the simulator, it is safe to |
| * emit f16to32 in align16 mode with UD as source data type. |
| */ |
| |
| dst_reg tmp_dst(this, glsl_uvec2_type()); |
| src_reg tmp_src(tmp_dst); |
| |
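| /* Split the packed dword: the low 16 bits go to tmp.x and the high 16 |
| * bits to tmp.y, so that a single F16TO32 can convert both halves. |
| */ |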
| tmp_dst.writemask = WRITEMASK_X; |
| emit(AND(tmp_dst, src0, elk_imm_ud(0xffffu))); |
| |
| tmp_dst.writemask = WRITEMASK_Y; |
| emit(SHR(tmp_dst, src0, elk_imm_ud(16u))); |
| |
| dst.writemask = WRITEMASK_XY; |
| emit(F16TO32(dst, tmp_src)); |
| } |
| |
| void |
| vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0) |
| { |
| /* Instead of splitting the 32-bit integer, shifting, and ORing it back |
| * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate |
| * is not suitable to generate the shift values, but we can use the packed |
| * vector float and a type-converting MOV. |
| */ |
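| /* The VF immediates 0x00, 0x60, 0x70 and 0x78 encode the floats 0.0, 8.0, |
| * 16.0 and 24.0; the type-converting MOV below turns them into the UD |
| * shift counts <0, 8, 16, 24>. |
| */ |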
| dst_reg shift(this, glsl_uvec4_type()); |
| emit(MOV(shift, elk_imm_vf4(0x00, 0x60, 0x70, 0x78))); |
| |
| dst_reg shifted(this, glsl_uvec4_type()); |
| src0.swizzle = ELK_SWIZZLE_XXXX; |
| emit(SHR(shifted, src0, src_reg(shift))); |
| |
| shifted.type = ELK_REGISTER_TYPE_UB; |
| dst_reg f(this, glsl_vec4_type()); |
| emit(ELK_VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted)); |
| |
| emit(MUL(dst, src_reg(f), elk_imm_f(1.0f / 255.0f))); |
| } |
| |
| void |
| vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0) |
| { |
| /* Instead of splitting the 32-bit integer, shifting, and ORing it back |
| * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate |
| * is not suitable to generate the shift values, but we can use the packed |
| * vector float and a type-converting MOV. |
| */ |
| dst_reg shift(this, glsl_uvec4_type()); |
| emit(MOV(shift, elk_imm_vf4(0x00, 0x60, 0x70, 0x78))); |
| |
| dst_reg shifted(this, glsl_uvec4_type()); |
| src0.swizzle = ELK_SWIZZLE_XXXX; |
| emit(SHR(shifted, src0, src_reg(shift))); |
| |
| shifted.type = ELK_REGISTER_TYPE_B; |
| dst_reg f(this, glsl_vec4_type()); |
| emit(ELK_VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted)); |
| |
| dst_reg scaled(this, glsl_vec4_type()); |
| emit(MUL(scaled, src_reg(f), elk_imm_f(1.0f / 127.0f))); |
| |
| dst_reg max(this, glsl_vec4_type()); |
| emit_minmax(ELK_CONDITIONAL_GE, max, src_reg(scaled), elk_imm_f(-1.0f)); |
| emit_minmax(ELK_CONDITIONAL_L, dst, src_reg(max), elk_imm_f(1.0f)); |
| } |
| |
| void |
| vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0) |
| { |
| dst_reg saturated(this, glsl_vec4_type()); |
| vec4_instruction *inst = emit(MOV(saturated, src0)); |
| inst->saturate = true; |
| |
| dst_reg scaled(this, glsl_vec4_type()); |
| emit(MUL(scaled, src_reg(saturated), elk_imm_f(255.0f))); |
| |
| dst_reg rounded(this, glsl_vec4_type()); |
| emit(RNDE(rounded, src_reg(scaled))); |
| |
| dst_reg u(this, glsl_uvec4_type()); |
| emit(MOV(u, src_reg(rounded))); |
| |
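| /* PACK_BYTES takes the low byte of each 32-bit channel and packs the four |
| * bytes into a single dword, producing the packUnorm4x8 result. |
| */ |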
| src_reg bytes(u); |
| emit(ELK_VEC4_OPCODE_PACK_BYTES, dst, bytes); |
| } |
| |
| void |
| vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0) |
| { |
| dst_reg max(this, glsl_vec4_type()); |
| emit_minmax(ELK_CONDITIONAL_GE, max, src0, elk_imm_f(-1.0f)); |
| |
| dst_reg min(this, glsl_vec4_type()); |
| emit_minmax(ELK_CONDITIONAL_L, min, src_reg(max), elk_imm_f(1.0f)); |
| |
| dst_reg scaled(this, glsl_vec4_type()); |
| emit(MUL(scaled, src_reg(min), elk_imm_f(127.0f))); |
| |
| dst_reg rounded(this, glsl_vec4_type()); |
| emit(RNDE(rounded, src_reg(scaled))); |
| |
| dst_reg i(this, glsl_ivec4_type()); |
| emit(MOV(i, src_reg(rounded))); |
| |
| src_reg bytes(i); |
| emit(ELK_VEC4_OPCODE_PACK_BYTES, dst, bytes); |
| } |
| |
| /** |
| * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 == |
| * false) elements needed to pack a type. |
| */ |
| static int |
| elk_type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless) |
| { |
| unsigned int i; |
| int size; |
| |
| switch (type->base_type) { |
| case GLSL_TYPE_UINT: |
| case GLSL_TYPE_INT: |
| case GLSL_TYPE_FLOAT: |
| case GLSL_TYPE_FLOAT16: |
| case GLSL_TYPE_BFLOAT16: |
| case GLSL_TYPE_FLOAT_E4M3FN: |
| case GLSL_TYPE_FLOAT_E5M2: |
| case GLSL_TYPE_BOOL: |
| case GLSL_TYPE_DOUBLE: |
| case GLSL_TYPE_UINT16: |
| case GLSL_TYPE_INT16: |
| case GLSL_TYPE_UINT8: |
| case GLSL_TYPE_INT8: |
| case GLSL_TYPE_UINT64: |
| case GLSL_TYPE_INT64: |
| if (glsl_type_is_matrix(type)) { |
| const glsl_type *col_type = glsl_get_column_type(type); |
| unsigned col_slots = |
| (as_vec4 && glsl_type_is_dual_slot(col_type)) ? 2 : 1; |
| return type->matrix_columns * col_slots; |
| } else { |
| /* Regardless of size of vector, it gets a vec4. This is bad |
| * packing for things like floats, but otherwise arrays become a |
| * mess. Hopefully a later pass over the code can pack scalars |
| * down if appropriate. |
| */ |
| return (as_vec4 && glsl_type_is_dual_slot(type)) ? 2 : 1; |
| } |
| case GLSL_TYPE_ARRAY: |
| assert(type->length > 0); |
| return elk_type_size_xvec4(type->fields.array, as_vec4, bindless) * |
| type->length; |
| case GLSL_TYPE_STRUCT: |
| case GLSL_TYPE_INTERFACE: |
| size = 0; |
| for (i = 0; i < type->length; i++) { |
| size += elk_type_size_xvec4(type->fields.structure[i].type, as_vec4, |
| bindless); |
| } |
| return size; |
| case GLSL_TYPE_SUBROUTINE: |
| return 1; |
| |
| case GLSL_TYPE_SAMPLER: |
| case GLSL_TYPE_TEXTURE: |
| /* Samplers and textures take up no register space, since they're baked |
| * in at link time. |
| */ |
| return bindless ? 1 : 0; |
| case GLSL_TYPE_ATOMIC_UINT: |
| return 0; |
| case GLSL_TYPE_IMAGE: |
| return bindless ? 1 : DIV_ROUND_UP(ISL_IMAGE_PARAM_SIZE, 4); |
| case GLSL_TYPE_VOID: |
| case GLSL_TYPE_ERROR: |
| case GLSL_TYPE_COOPERATIVE_MATRIX: |
| unreachable("not reached"); |
| } |
| |
| return 0; |
| } |
| |
| /** |
| * Returns the minimum number of vec4 elements needed to pack a type. |
| * |
| * For simple types, it will return 1 (a single vec4); for matrices, the |
| * number of columns; for array and struct, the sum of the vec4_size of |
| * each of its elements; and for sampler and atomic, zero. |
| * |
| * This method is useful to calculate how much register space is needed to |
| * store a particular type. |
| */ |
| extern "C" int |
| elk_type_size_vec4(const struct glsl_type *type, bool bindless) |
| { |
| return elk_type_size_xvec4(type, true, bindless); |
| } |
| |
| /** |
| * Returns the minimum number of dvec4 elements needed to pack a type. |
| * |
| * For simple types, it will return 1 (a single dvec4); for matrices, the |
| * number of columns; for array and struct, the sum of the dvec4_size of |
| * each of its elements; and for sampler and atomic, zero. |
| * |
| * This method is useful to calculate how much register space is needed to |
| * store a particular type. |
| * |
| * Measuring double-precision vertex inputs as dvec4 is required because |
| * ARB_vertex_attrib_64bit states that these use the same number of locations |
| * as the single-precision version. That is, two consecutive dvec4s would be |
| * located at location "x" and location "x+1", not "x+2". |
| * |
| * In order to map vec4/dvec4 vertex inputs to the proper ATTRs, |
| * remap_vs_attrs() takes into account both the location and whether the |
| * type fits in one or two vec4 slots. |
| */ |
| extern "C" int |
| elk_type_size_dvec4(const struct glsl_type *type, bool bindless) |
| { |
| return elk_type_size_xvec4(type, false, bindless); |
| } |
| |
| src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type) |
| { |
| init(); |
| |
| this->file = VGRF; |
| this->nr = v->alloc.allocate(elk_type_size_vec4(type, false)); |
| |
| if (glsl_type_is_array(type) || glsl_type_is_struct(type)) { |
| this->swizzle = ELK_SWIZZLE_NOOP; |
| } else { |
| this->swizzle = elk_swizzle_for_size(type->vector_elements); |
| } |
| |
| this->type = elk_type_for_base_type(type); |
| } |
| |
| src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size) |
| { |
| assert(size > 0); |
| |
| init(); |
| |
| this->file = VGRF; |
| this->nr = v->alloc.allocate(elk_type_size_vec4(type, false) * size); |
| |
| this->swizzle = ELK_SWIZZLE_NOOP; |
| |
| this->type = elk_type_for_base_type(type); |
| } |
| |
| dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type) |
| { |
| init(); |
| |
| this->file = VGRF; |
| this->nr = v->alloc.allocate(elk_type_size_vec4(type, false)); |
| |
| if (glsl_type_is_array(type) || glsl_type_is_struct(type)) { |
| this->writemask = WRITEMASK_XYZW; |
| } else { |
| this->writemask = (1 << type->vector_elements) - 1; |
| } |
| |
| this->type = elk_type_for_base_type(type); |
| } |
| |
| vec4_instruction * |
| vec4_visitor::emit_minmax(enum elk_conditional_mod conditionalmod, dst_reg dst, |
| src_reg src0, src_reg src1) |
| { |
| vec4_instruction *inst = emit(ELK_OPCODE_SEL, dst, src0, src1); |
| inst->conditional_mod = conditionalmod; |
| return inst; |
| } |
| |
| /** |
| * Emits the instructions needed to perform a pull constant load. before_block |
| * and before_inst can be NULL, in which case the instructions will be appended |
| * to the end of the instruction list. |
| */ |
| void |
| vec4_visitor::emit_pull_constant_load_reg(dst_reg dst, |
| src_reg surf_index, |
| src_reg offset_reg, |
| elk_bblock_t *before_block, |
| vec4_instruction *before_inst) |
| { |
| assert((before_inst == NULL && before_block == NULL) || |
| (before_inst && before_block)); |
| |
| vec4_instruction *pull; |
| |
| if (devinfo->ver >= 7) { |
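| /* The GFX7 message takes the offset as a one-register payload, so copy |
| * offset_reg into a fresh GRF of the matching type first. |
| */ |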
| dst_reg grf_offset = dst_reg(this, glsl_uint_type()); |
| |
| grf_offset.type = offset_reg.type; |
| |
| pull = MOV(grf_offset, offset_reg); |
| |
| if (before_inst) |
| emit_before(before_block, before_inst, pull); |
| else |
| emit(pull); |
| |
| pull = new(mem_ctx) vec4_instruction(ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7, |
| dst, |
| surf_index, |
| src_reg(grf_offset)); |
| pull->mlen = 1; |
| } else { |
| pull = new(mem_ctx) vec4_instruction(ELK_VS_OPCODE_PULL_CONSTANT_LOAD, |
| dst, |
| surf_index, |
| offset_reg); |
| pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1; |
| pull->mlen = 1; |
| } |
| |
| if (before_inst) |
| emit_before(before_block, before_inst, pull); |
| else |
| emit(pull); |
| } |
| |
| src_reg |
| vec4_visitor::emit_uniformize(const src_reg &src) |
| { |
| const src_reg chan_index(this, glsl_uint_type()); |
| const dst_reg dst = retype(dst_reg(this, glsl_uint_type()), |
| src.type); |
| |
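| /* Find the index of some enabled channel, then broadcast that channel's |
| * value of src to every channel of dst so the result is dynamically |
| * uniform. |
| */ |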
| emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index)) |
| ->force_writemask_all = true; |
| emit(ELK_SHADER_OPCODE_BROADCAST, dst, src, chan_index) |
| ->force_writemask_all = true; |
| |
| return src_reg(dst); |
| } |
| |
| void |
| vec4_visitor::gs_emit_vertex(int /* stream_id */) |
| { |
| unreachable("not reached"); |
| } |
| |
| void |
| vec4_visitor::gs_end_primitive() |
| { |
| unreachable("not reached"); |
| } |
| |
| void |
| vec4_visitor::emit_ndc_computation() |
| { |
| if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE) |
| return; |
| |
| /* Get the position */ |
| src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]); |
| |
| /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */ |
| dst_reg ndc = dst_reg(this, glsl_vec4_type()); |
| output_reg[ELK_VARYING_SLOT_NDC][0] = ndc; |
| output_num_components[ELK_VARYING_SLOT_NDC][0] = 4; |
| |
| current_annotation = "NDC"; |
| dst_reg ndc_w = ndc; |
| ndc_w.writemask = WRITEMASK_W; |
| src_reg pos_w = pos; |
| pos_w.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_W, ELK_SWIZZLE_W, ELK_SWIZZLE_W, ELK_SWIZZLE_W); |
| emit_math(ELK_SHADER_OPCODE_RCP, ndc_w, pos_w); |
| |
| dst_reg ndc_xyz = ndc; |
| ndc_xyz.writemask = WRITEMASK_XYZ; |
| |
| emit(MUL(ndc_xyz, pos, src_reg(ndc_w))); |
| } |
| |
| void |
| vec4_visitor::emit_psiz_and_flags(dst_reg reg) |
| { |
| if (devinfo->ver < 6 && |
| ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) || |
| output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE || |
| compiler->has_negative_rhw_bug)) { |
| dst_reg header1 = dst_reg(this, glsl_uvec4_type()); |
| dst_reg header1_w = header1; |
| header1_w.writemask = WRITEMASK_W; |
| |
| emit(MOV(header1, elk_imm_ud(0u))); |
| |
| if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { |
| src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); |
| |
| current_annotation = "Point size"; |
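| /* The point size goes into the header as an 11-bit fixed-point value in |
| * bits 18:8 of the W channel, hence the scale by 2^11 followed by the |
| * mask. |
| */ |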
| emit(MUL(header1_w, psiz, elk_imm_f((float)(1 << 11)))); |
| emit(AND(header1_w, src_reg(header1_w), elk_imm_d(0x7ff << 8))); |
| } |
| |
| if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) { |
| current_annotation = "Clipping flags"; |
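| /* Set a flag bit for each clip-distance channel that compares below zero, |
| * then unpack those per-channel flag bits into a GRF so they can be ORed |
| * into the clip-flag bits of the header's W channel. |
| */ |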
| dst_reg flags0 = dst_reg(this, glsl_uint_type()); |
| |
| emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), elk_imm_f(0.0f), ELK_CONDITIONAL_L)); |
| emit(ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, elk_imm_d(0)); |
| emit(OR(header1_w, src_reg(header1_w), src_reg(flags0))); |
| } |
| |
| if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) { |
| dst_reg flags1 = dst_reg(this, glsl_uint_type()); |
| emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), elk_imm_f(0.0f), ELK_CONDITIONAL_L)); |
| emit(ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, elk_imm_d(0)); |
| emit(SHL(flags1, src_reg(flags1), elk_imm_d(4))); |
| emit(OR(header1_w, src_reg(header1_w), src_reg(flags1))); |
| } |
| |
| /* i965 clipping workaround: |
| * 1) Test for -ve rhw |
| * 2) If set, |
| * set ndc = (0,0,0,0) |
| * set ucp[6] = 1 |
| * |
| * Later, clipping will detect ucp[6] and ensure the primitive is |
| * clipped against all fixed planes. |
| */ |
| if (compiler->has_negative_rhw_bug && |
| output_reg[ELK_VARYING_SLOT_NDC][0].file != BAD_FILE) { |
| src_reg ndc_w = src_reg(output_reg[ELK_VARYING_SLOT_NDC][0]); |
| ndc_w.swizzle = ELK_SWIZZLE_WWWW; |
| emit(CMP(dst_null_f(), ndc_w, elk_imm_f(0.0f), ELK_CONDITIONAL_L)); |
| vec4_instruction *inst; |
| inst = emit(OR(header1_w, src_reg(header1_w), elk_imm_ud(1u << 6))); |
| inst->predicate = ELK_PREDICATE_NORMAL; |
| output_reg[ELK_VARYING_SLOT_NDC][0].type = ELK_REGISTER_TYPE_F; |
| inst = emit(MOV(output_reg[ELK_VARYING_SLOT_NDC][0], elk_imm_f(0.0f))); |
| inst->predicate = ELK_PREDICATE_NORMAL; |
| } |
| |
| emit(MOV(retype(reg, ELK_REGISTER_TYPE_UD), src_reg(header1))); |
| } else if (devinfo->ver < 6) { |
| emit(MOV(retype(reg, ELK_REGISTER_TYPE_UD), elk_imm_ud(0u))); |
| } else { |
| emit(MOV(retype(reg, ELK_REGISTER_TYPE_D), elk_imm_d(0))); |
| if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) { |
| dst_reg reg_w = reg; |
| reg_w.writemask = WRITEMASK_W; |
| src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); |
| reg_as_src.type = reg_w.type; |
| reg_as_src.swizzle = elk_swizzle_for_size(1); |
| emit(MOV(reg_w, reg_as_src)); |
| } |
| if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) { |
| dst_reg reg_y = reg; |
| reg_y.writemask = WRITEMASK_Y; |
| reg_y.type = ELK_REGISTER_TYPE_D; |
| output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type; |
| emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0]))); |
| } |
| if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) { |
| dst_reg reg_z = reg; |
| reg_z.writemask = WRITEMASK_Z; |
| reg_z.type = ELK_REGISTER_TYPE_D; |
| output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type; |
| emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0]))); |
| } |
| } |
| } |
| |
| vec4_instruction * |
| vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component) |
| { |
| assert(varying < VARYING_SLOT_MAX); |
| |
| unsigned num_comps = output_num_components[varying][component]; |
| if (num_comps == 0) |
| return NULL; |
| |
| assert(output_reg[varying][component].type == reg.type); |
| current_annotation = output_reg_annotation[varying]; |
| if (output_reg[varying][component].file != BAD_FILE) { |
| src_reg src = src_reg(output_reg[varying][component]); |
| src.swizzle = ELK_SWZ_COMP_OUTPUT(component); |
| reg.writemask = |
| elk_writemask_for_component_packing(num_comps, component); |
| return emit(MOV(reg, src)); |
| } |
| return NULL; |
| } |
| |
| void |
| vec4_visitor::emit_urb_slot(dst_reg reg, int varying) |
| { |
| reg.type = ELK_REGISTER_TYPE_F; |
| output_reg[varying][0].type = reg.type; |
| |
| switch (varying) { |
| case VARYING_SLOT_PSIZ: |
| { |
| /* PSIZ is always in slot 0, and is coupled with other flags. */ |
| current_annotation = "indices, point width, clip flags"; |
| emit_psiz_and_flags(reg); |
| break; |
| } |
| case ELK_VARYING_SLOT_NDC: |
| current_annotation = "NDC"; |
| if (output_reg[ELK_VARYING_SLOT_NDC][0].file != BAD_FILE) |
| emit(MOV(reg, src_reg(output_reg[ELK_VARYING_SLOT_NDC][0]))); |
| break; |
| case VARYING_SLOT_POS: |
| current_annotation = "gl_Position"; |
| if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE) |
| emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0]))); |
| break; |
| case ELK_VARYING_SLOT_PAD: |
| /* No need to write to this slot */ |
| break; |
| default: |
| for (int i = 0; i < 4; i++) { |
| emit_generic_urb_slot(reg, varying, i); |
| } |
| break; |
| } |
| } |
| |
| static unsigned |
| align_interleaved_urb_mlen(const struct intel_device_info *devinfo, |
| unsigned mlen) |
| { |
| if (devinfo->ver >= 6) { |
| /* URB data written (does not include the message header reg) must |
| * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, |
| * section 5.4.3.2.2: URB_INTERLEAVED. |
| * |
| * URB entries are allocated on a multiple of 1024 bits, so an |
| * extra 128 bits written here to make the end align to 256 is |
| * no problem. |
| */ |
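| /* The mlen passed in includes the message header register, so forcing |
| * the total to be odd keeps the post-header data length even. |
| */ |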
| if ((mlen % 2) != 1) |
| mlen++; |
| } |
| |
| return mlen; |
| } |
| |
| |
| /** |
| * Generates the VUE payload plus the necessary URB write instructions to |
| * output it. |
| * |
| * The VUE layout is documented in Volume 2a. |
| */ |
| void |
| vec4_visitor::emit_vertex() |
| { |
| /* MRF 0 is reserved for the debugger, so start with message header |
| * in MRF 1. |
| */ |
| int base_mrf = 1; |
| int mrf = base_mrf; |
| /* In the process of generating our URB write message contents, we |
| * may need to unspill a register or load from an array. Those |
| * reads would use MRFs 14-15. |
| */ |
| int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver); |
| |
| /* The following assertion verifies that max_usable_mrf causes an |
| * even-numbered amount of URB write data, which will meet gfx6's |
| * requirements for length alignment. |
| */ |
| assert ((max_usable_mrf - base_mrf) % 2 == 0); |
| |
| /* First mrf is the g0-based message header containing URB handles and |
| * such. |
| */ |
| emit_urb_write_header(mrf++); |
| |
| if (devinfo->ver < 6) { |
| emit_ndc_computation(); |
| } |
| |
| /* We may need to split this up into several URB writes, so do them in a |
| * loop. |
| */ |
| int slot = 0; |
| bool complete = false; |
| do { |
| /* URB offset is in URB row increments, and each of our MRFs is half of |
| * one of those, since we're doing interleaved writes. |
| */ |
| int offset = slot / 2; |
| |
| mrf = base_mrf + 1; |
| for (; slot < prog_data->vue_map.num_slots; ++slot) { |
| emit_urb_slot(dst_reg(MRF, mrf++), |
| prog_data->vue_map.slot_to_varying[slot]); |
| |
| /* If this was max_usable_mrf, we can't fit anything more into this |
| * URB WRITE. Same thing if we reached the maximum length available. |
| */ |
| if (mrf > max_usable_mrf || |
| align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > ELK_MAX_MSG_LENGTH) { |
| slot++; |
| break; |
| } |
| } |
| |
| complete = slot >= prog_data->vue_map.num_slots; |
| current_annotation = "URB write"; |
| vec4_instruction *inst = emit_urb_write_opcode(complete); |
| inst->base_mrf = base_mrf; |
| inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf); |
| inst->offset += offset; |
| } while(!complete); |
| } |
| |
| |
| src_reg |
| vec4_visitor::get_scratch_offset(elk_bblock_t *block, vec4_instruction *inst, |
| src_reg *reladdr, int reg_offset) |
| { |
| /* Because we store the values to scratch interleaved like our |
| * vertex data, we need to scale the vec4 index by 2. |
| */ |
| int message_header_scale = 2; |
| |
| /* Pre-gfx6, the message header uses byte offsets instead of vec4 |
| * (16-byte) offset units. |
| */ |
| if (devinfo->ver < 6) |
| message_header_scale *= 16; |
| |
| if (reladdr) { |
| /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have |
| * to multiply the reladdr by 2. Notice that the reg_offset part |
| * is in units of 16 bytes and is used to select the low/high 16-byte |
| * chunk of a full dvec4, so we don't want to multiply that part. |
| */ |
| src_reg index = src_reg(this, glsl_int_type()); |
| if (type_sz(inst->dst.type) < 8) { |
| emit_before(block, inst, ADD(dst_reg(index), *reladdr, |
| elk_imm_d(reg_offset))); |
| emit_before(block, inst, MUL(dst_reg(index), index, |
| elk_imm_d(message_header_scale))); |
| } else { |
| emit_before(block, inst, MUL(dst_reg(index), *reladdr, |
| elk_imm_d(message_header_scale * 2))); |
| emit_before(block, inst, ADD(dst_reg(index), index, |
| elk_imm_d(reg_offset * message_header_scale))); |
| } |
| return index; |
| } else { |
| return elk_imm_d(reg_offset * message_header_scale); |
| } |
| } |
| |
| /** |
| * Emits an instruction before @inst to load the value named by @orig_src |
| * from scratch space at @base_offset to @temp. |
| * |
| * @base_offset is measured in 32-byte units (the size of a register). |
| */ |
| void |
| vec4_visitor::emit_scratch_read(elk_bblock_t *block, vec4_instruction *inst, |
| dst_reg temp, src_reg orig_src, |
| int base_offset) |
| { |
| assert(orig_src.offset % REG_SIZE == 0); |
| int reg_offset = base_offset + orig_src.offset / REG_SIZE; |
| src_reg index = get_scratch_offset(block, inst, orig_src.reladdr, |
| reg_offset); |
| |
| if (type_sz(orig_src.type) < 8) { |
| emit_before(block, inst, SCRATCH_READ(temp, index)); |
| } else { |
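| /* A dvec4 occupies two registers in scratch space, so issue two scratch |
| * reads at consecutive offsets and then shuffle the data back into the |
| * expected 64-bit channel layout. |
| */ |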
| dst_reg shuffled = dst_reg(this, glsl_dvec4_type()); |
| dst_reg shuffled_float = retype(shuffled, ELK_REGISTER_TYPE_F); |
| emit_before(block, inst, SCRATCH_READ(shuffled_float, index)); |
| index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1); |
| vec4_instruction *last_read = |
| SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index); |
| emit_before(block, inst, last_read); |
| shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read); |
| } |
| } |
| |
| /** |
| * Emits an instruction after @inst to store the value to be written |
| * to @orig_dst to scratch space at @base_offset, from @temp. |
| * |
| * @base_offset is measured in 32-byte units (the size of a register). |
| */ |
| void |
| vec4_visitor::emit_scratch_write(elk_bblock_t *block, vec4_instruction *inst, |
| int base_offset) |
| { |
| assert(inst->dst.offset % REG_SIZE == 0); |
| int reg_offset = base_offset + inst->dst.offset / REG_SIZE; |
| src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr, |
| reg_offset); |
| |
| /* Create a temporary register to store *inst's result in. |
| * |
| * We have to be careful in MOVing from our temporary result register in |
| * the scratch write. If we swizzle from channels of the temporary that |
| * weren't initialized, it will confuse live interval analysis, which will |
| * make spilling fail to make progress. |
| */ |
| bool is_64bit = type_sz(inst->dst.type) == 8; |
| const glsl_type *alloc_type = |
| is_64bit ? glsl_dvec4_type() : glsl_vec4_type(); |
| const src_reg temp = swizzle(retype(src_reg(this, alloc_type), |
| inst->dst.type), |
| elk_swizzle_for_mask(inst->dst.writemask)); |
| |
| if (!is_64bit) { |
| dst_reg dst = dst_reg(elk_writemask(elk_vec8_grf(0, 0), |
| inst->dst.writemask)); |
| vec4_instruction *write = SCRATCH_WRITE(dst, temp, index); |
| if (inst->opcode != ELK_OPCODE_SEL) |
| write->predicate = inst->predicate; |
| write->ir = inst->ir; |
| write->annotation = inst->annotation; |
| inst->insert_after(block, write); |
| } else { |
| dst_reg shuffled = dst_reg(this, alloc_type); |
| vec4_instruction *last = |
| shuffle_64bit_data(shuffled, temp, true, true, block, inst); |
| src_reg shuffled_float = src_reg(retype(shuffled, ELK_REGISTER_TYPE_F)); |
| |
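| /* After the shuffle, each 64-bit component spans two 32-bit channels: the |
| * X and Y components of the destination live in the first register of |
| * shuffled_float and Z and W in the second, hence the writemask |
| * translation below. |
| */ |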
| uint8_t mask = 0; |
| if (inst->dst.writemask & WRITEMASK_X) |
| mask |= WRITEMASK_XY; |
| if (inst->dst.writemask & WRITEMASK_Y) |
| mask |= WRITEMASK_ZW; |
| if (mask) { |
| dst_reg dst = dst_reg(elk_writemask(elk_vec8_grf(0, 0), mask)); |
| |
| vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index); |
| if (inst->opcode != ELK_OPCODE_SEL) |
| write->predicate = inst->predicate; |
| write->ir = inst->ir; |
| write->annotation = inst->annotation; |
| last->insert_after(block, write); |
| } |
| |
| mask = 0; |
| if (inst->dst.writemask & WRITEMASK_Z) |
| mask |= WRITEMASK_XY; |
| if (inst->dst.writemask & WRITEMASK_W) |
| mask |= WRITEMASK_ZW; |
| if (mask) { |
| dst_reg dst = dst_reg(elk_writemask(elk_vec8_grf(0, 0), mask)); |
| |
| src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr, |
| reg_offset + 1); |
| vec4_instruction *write = |
| SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index); |
| if (inst->opcode != ELK_OPCODE_SEL) |
| write->predicate = inst->predicate; |
| write->ir = inst->ir; |
| write->annotation = inst->annotation; |
| last->insert_after(block, write); |
| } |
| } |
| |
| inst->dst.file = temp.file; |
| inst->dst.nr = temp.nr; |
| inst->dst.offset %= REG_SIZE; |
| inst->dst.reladdr = NULL; |
| } |
| |
| /** |
| * Checks if \p src and/or \p src.reladdr require a scratch read, and if so, |
| * adds the scratch read(s) before \p inst. The function also checks for |
| * recursive reladdr scratch accesses, issuing the corresponding scratch |
| * loads and rewriting reladdr references accordingly. |
| * |
| * \return \p src if it did not require a scratch load, otherwise, the |
| * register holding the result of the scratch load that the caller should |
| * use to rewrite src. |
| */ |
| src_reg |
| vec4_visitor::emit_resolve_reladdr(int scratch_loc[], elk_bblock_t *block, |
| vec4_instruction *inst, src_reg src) |
| { |
| /* Resolve recursive reladdr scratch access by calling ourselves |
| * with src.reladdr |
| */ |
| if (src.reladdr) |
| *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst, |
| *src.reladdr); |
| |
| /* Now handle scratch access on src */ |
| if (src.file == VGRF && scratch_loc[src.nr] != -1) { |
| dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ? |
| glsl_dvec4_type() : glsl_vec4_type()); |
| emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]); |
| src.nr = temp.nr; |
| src.offset %= REG_SIZE; |
| src.reladdr = NULL; |
| } |
| |
| return src; |
| } |
| |
| /** |
| * We can't generally support array access in GRF space, because a |
| * single instruction's destination can only span 2 contiguous |
| * registers. So, we send all GRF arrays that get variable index |
| * access to scratch space. |
| */ |
| void |
| vec4_visitor::move_grf_array_access_to_scratch() |
| { |
| int *scratch_loc = ralloc_array(NULL, int, this->alloc.count); |
| memset(scratch_loc, -1, sizeof(int) * this->alloc.count); |
| |
| /* First, calculate the set of virtual GRFs that need to be punted |
| * to scratch due to having any array access on them, and where in |
| * scratch. |
| */ |
| foreach_block_and_inst(block, vec4_instruction, inst, cfg) { |
| if (inst->dst.file == VGRF && inst->dst.reladdr) { |
| if (scratch_loc[inst->dst.nr] == -1) { |
| scratch_loc[inst->dst.nr] = last_scratch; |
| last_scratch += this->alloc.sizes[inst->dst.nr]; |
| } |
| |
| for (src_reg *iter = inst->dst.reladdr; |
| iter->reladdr; |
| iter = iter->reladdr) { |
| if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { |
| scratch_loc[iter->nr] = last_scratch; |
| last_scratch += this->alloc.sizes[iter->nr]; |
| } |
| } |
| } |
| |
| for (int i = 0 ; i < 3; i++) { |
| for (src_reg *iter = &inst->src[i]; |
| iter->reladdr; |
| iter = iter->reladdr) { |
| if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { |
| scratch_loc[iter->nr] = last_scratch; |
| last_scratch += this->alloc.sizes[iter->nr]; |
| } |
| } |
| } |
| } |
| |
| /* Now, for anything that will be accessed through scratch, rewrite |
| * it to load/store. Note that this is a _safe list walk, because |
| * we may generate a new scratch_write instruction after the one |
| * we're processing. |
| */ |
| foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { |
| /* Set up the annotation tracking for new generated instructions. */ |
| base_ir = inst->ir; |
| current_annotation = inst->annotation; |
| |
| /* First handle scratch access on the dst. Notice we have to handle |
| * the case where the dst's reladdr also points to scratch space. |
| */ |
| if (inst->dst.reladdr) |
| *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst, |
| *inst->dst.reladdr); |
| |
| /* Now that we have handled any (possibly recursive) reladdr scratch |
| * accesses for dst we can safely do the scratch write for dst itself |
| */ |
| if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1) |
| emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]); |
| |
| /* Now handle scratch access on any src. In this case, since inst->src[i] |
| * already is a src_reg, we can just call emit_resolve_reladdr with |
| * inst->src[i] and it will take care of handling scratch loads for |
| * both src and src.reladdr (recursively). |
| */ |
| for (int i = 0 ; i < 3; i++) { |
| inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst, |
| inst->src[i]); |
| } |
| } |
| |
| ralloc_free(scratch_loc); |
| } |
| |
| void |
| vec4_visitor::resolve_ud_negate(src_reg *reg) |
| { |
| if (reg->type != ELK_REGISTER_TYPE_UD || |
| !reg->negate) |
| return; |
| |
| src_reg temp = src_reg(this, glsl_uvec4_type()); |
| emit(ELK_OPCODE_MOV, dst_reg(temp), *reg); |
| *reg = temp; |
| } |
| |
| static elk_rnd_mode |
| elk_rnd_mode_from_execution_mode(unsigned execution_mode) |
| { |
| if (nir_has_any_rounding_mode_rtne(execution_mode)) |
| return ELK_RND_MODE_RTNE; |
| if (nir_has_any_rounding_mode_rtz(execution_mode)) |
| return ELK_RND_MODE_RTZ; |
| return ELK_RND_MODE_UNSPECIFIED; |
| } |
| |
| void |
| vec4_visitor::emit_shader_float_controls_execution_mode() |
| { |
| unsigned execution_mode = this->nir->info.float_controls_execution_mode; |
| if (nir_has_any_rounding_mode_enabled(execution_mode)) { |
| elk_rnd_mode rnd = elk_rnd_mode_from_execution_mode(execution_mode); |
| const vec4_builder bld = vec4_builder(this).at_end(); |
| bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, dst_null_ud(), elk_imm_d(rnd)); |
| } |
| } |
| |
| vec4_visitor::vec4_visitor(const struct elk_compiler *compiler, |
| const struct elk_compile_params *params, |
| const struct elk_sampler_prog_key_data *key_tex, |
| struct elk_vue_prog_data *prog_data, |
| const nir_shader *shader, |
| bool no_spills, |
| bool debug_enabled) |
| : elk_backend_shader(compiler, params, shader, &prog_data->base, debug_enabled), |
| key_tex(key_tex), |
| prog_data(prog_data), |
| fail_msg(NULL), |
| first_non_payload_grf(0), |
| ubo_push_start(), |
| push_length(0), |
| live_analysis(this), performance_analysis(this), |
| no_spills(no_spills), |
| last_scratch(0) |
| { |
| this->failed = false; |
| |
| this->base_ir = NULL; |
| this->current_annotation = NULL; |
| memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation)); |
| |
| memset(this->output_num_components, 0, sizeof(this->output_num_components)); |
| |
| this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : ELK_MAX_GRF; |
| |
| this->uniforms = 0; |
| |
| this->nir_ssa_values = NULL; |
| } |
| |
| |
| void |
| vec4_visitor::fail(const char *format, ...) |
| { |
| va_list va; |
| char *msg; |
| |
| if (failed) |
| return; |
| |
| failed = true; |
| |
| va_start(va, format); |
| msg = ralloc_vasprintf(mem_ctx, format, va); |
| va_end(va); |
| msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", |
| _mesa_shader_stage_to_abbrev(stage), msg); |
| |
| this->fail_msg = msg; |
| |
| if (unlikely(debug_enabled)) { |
| fprintf(stderr, "%s", msg); |
| } |
| } |
| |
| } /* namespace elk */ |