| /* |
| * Copyright © Microsoft Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "dxil_nir.h" |
| |
| #include "nir_builder.h" |
| #include "nir_deref.h" |
| #include "nir_to_dxil.h" |
| #include "util/u_math.h" |
| |
| static void |
| cl_type_size_align(const struct glsl_type *type, unsigned *size, |
| unsigned *align) |
| { |
| *size = glsl_get_cl_size(type); |
| *align = glsl_get_cl_alignment(type); |
| } |
| |
| static void |
| extract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32, |
| unsigned dst_bit_size, |
| nir_ssa_def **dst_comps, |
| unsigned num_dst_comps) |
| { |
| unsigned step = DIV_ROUND_UP(dst_bit_size, 32); |
| unsigned comps_per32b = 32 / dst_bit_size; |
| nir_ssa_def *tmp; |
| |
| for (unsigned i = 0; i < vec32->num_components; i += step) { |
| switch (dst_bit_size) { |
| case 64: |
| tmp = nir_pack_64_2x32_split(b, nir_channel(b, vec32, i), |
| nir_channel(b, vec32, i + 1)); |
| dst_comps[i / 2] = tmp; |
| break; |
| case 32: |
| dst_comps[i] = nir_channel(b, vec32, i); |
| break; |
| case 16: |
| case 8: { |
| unsigned dst_offs = i * comps_per32b; |
| |
| tmp = nir_unpack_bits(b, nir_channel(b, vec32, i), dst_bit_size); |
| for (unsigned j = 0; j < comps_per32b && dst_offs + j < num_dst_comps; j++) |
| dst_comps[dst_offs + j] = nir_channel(b, tmp, j); |
| } |
| |
| break; |
| } |
| } |
| } |
| |
| static nir_ssa_def * |
| load_comps_to_vec32(nir_builder *b, unsigned src_bit_size, |
| nir_ssa_def **src_comps, unsigned num_src_comps) |
| { |
| unsigned num_vec32comps = DIV_ROUND_UP(num_src_comps * src_bit_size, 32); |
| unsigned step = DIV_ROUND_UP(src_bit_size, 32); |
| unsigned comps_per32b = 32 / src_bit_size; |
| nir_ssa_def *vec32comps[4]; |
| |
| for (unsigned i = 0; i < num_vec32comps; i += step) { |
| switch (src_bit_size) { |
| case 64: |
| vec32comps[i] = nir_unpack_64_2x32_split_x(b, src_comps[i / 2]); |
| vec32comps[i + 1] = nir_unpack_64_2x32_split_y(b, src_comps[i / 2]); |
| break; |
| case 32: |
| vec32comps[i] = src_comps[i]; |
| break; |
| case 16: |
| case 8: { |
| unsigned src_offs = i * comps_per32b; |
| |
| vec32comps[i] = nir_u2u32(b, src_comps[src_offs]); |
| for (unsigned j = 1; j < comps_per32b && src_offs + j < num_src_comps; j++) { |
| nir_ssa_def *tmp = nir_ishl(b, nir_u2u32(b, src_comps[src_offs + j]), |
| nir_imm_int(b, j * src_bit_size)); |
| vec32comps[i] = nir_ior(b, vec32comps[i], tmp); |
| } |
| break; |
| } |
| } |
| } |
| |
| return nir_vec(b, vec32comps, num_vec32comps); |
| } |
| |
| static nir_ssa_def * |
| build_load_ptr_dxil(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *idx) |
| { |
| return nir_load_ptr_dxil(b, 1, 32, &deref->dest.ssa, idx); |
| } |
| |
| static bool |
| lower_load_deref(nir_builder *b, nir_intrinsic_instr *intr) |
| { |
| assert(intr->dest.is_ssa); |
| |
| b->cursor = nir_before_instr(&intr->instr); |
| |
| nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); |
| if (!nir_deref_mode_is(deref, nir_var_shader_temp)) |
| return false; |
| nir_ssa_def *ptr = nir_u2u32(b, nir_build_deref_offset(b, deref, cl_type_size_align)); |
| nir_ssa_def *offset = nir_iand(b, ptr, nir_inot(b, nir_imm_int(b, 3))); |
| |
| assert(intr->dest.is_ssa); |
| unsigned num_components = nir_dest_num_components(intr->dest); |
| unsigned bit_size = nir_dest_bit_size(intr->dest); |
| unsigned load_size = MAX2(32, bit_size); |
| unsigned num_bits = num_components * bit_size; |
| nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; |
| unsigned comp_idx = 0; |
| |
| nir_deref_path path; |
| nir_deref_path_init(&path, deref, NULL); |
| nir_ssa_def *base_idx = nir_ishr(b, offset, nir_imm_int(b, 2 /* log2(32 / 8) */)); |
| |
| /* Split loads into 32-bit chunks */ |
| for (unsigned i = 0; i < num_bits; i += load_size) { |
| unsigned subload_num_bits = MIN2(num_bits - i, load_size); |
| nir_ssa_def *idx = nir_iadd(b, base_idx, nir_imm_int(b, i / 32)); |
| nir_ssa_def *vec32 = build_load_ptr_dxil(b, path.path[0], idx); |
| |
| if (load_size == 64) { |
| idx = nir_iadd(b, idx, nir_imm_int(b, 1)); |
| vec32 = nir_vec2(b, vec32, |
| build_load_ptr_dxil(b, path.path[0], idx)); |
| } |
| |
| /* If we have 2 bytes or less to load we need to adjust the u32 value so |
| * we can always extract the LSB. |
| */ |
| if (subload_num_bits <= 16) { |
| nir_ssa_def *shift = nir_imul(b, nir_iand(b, ptr, nir_imm_int(b, 3)), |
| nir_imm_int(b, 8)); |
| vec32 = nir_ushr(b, vec32, shift); |
| } |
| |
| /* And now comes the pack/unpack step to match the original type. */ |
| extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], |
| subload_num_bits / bit_size); |
| comp_idx += subload_num_bits / bit_size; |
| } |
| |
| nir_deref_path_finish(&path); |
| assert(comp_idx == num_components); |
| nir_ssa_def *result = nir_vec(b, comps, num_components); |
| nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); |
| nir_instr_remove(&intr->instr); |
| return true; |
| } |
| |
| static nir_ssa_def * |
| ubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32, |
| nir_ssa_def *offset, unsigned num_bytes) |
| { |
| assert(num_bytes == 16 || num_bytes == 12 || num_bytes == 8 || |
| num_bytes == 4 || num_bytes == 3 || num_bytes == 2 || |
| num_bytes == 1); |
| assert(vec32->num_components == 4); |
| |
| /* 16 and 12 byte types are always aligned on 16 bytes. */ |
| if (num_bytes > 8) |
| return vec32; |
| |
| nir_ssa_def *comps[4]; |
| nir_ssa_def *cond; |
| |
| for (unsigned i = 0; i < 4; i++) |
| comps[i] = nir_channel(b, vec32, i); |
| |
| /* If we have 8bytes or less to load, select which half the vec4 should |
| * be used. |
| */ |
| cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x8)), |
| nir_imm_int(b, 0)); |
| |
| comps[0] = nir_bcsel(b, cond, comps[2], comps[0]); |
| comps[1] = nir_bcsel(b, cond, comps[3], comps[1]); |
| |
| /* Thanks to the CL alignment constraints, if we want 8 bytes we're done. */ |
| if (num_bytes == 8) |
| return nir_vec(b, comps, 2); |
| |
| /* 4 bytes or less needed, select which of the 32bit component should be |
| * used and return it. The sub-32bit split is handled in |
| * extract_comps_from_vec32(). |
| */ |
| cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x4)), |
| nir_imm_int(b, 0)); |
| return nir_bcsel(b, cond, comps[1], comps[0]); |
| } |
| |
| nir_ssa_def * |
| build_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer, |
| nir_ssa_def *offset, unsigned num_components, |
| unsigned bit_size) |
| { |
| nir_ssa_def *idx = nir_ushr(b, offset, nir_imm_int(b, 4)); |
| nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; |
| unsigned num_bits = num_components * bit_size; |
| unsigned comp_idx = 0; |
| |
| /* We need to split loads in 16byte chunks because that's the |
| * granularity of cBufferLoadLegacy(). |
| */ |
| for (unsigned i = 0; i < num_bits; i += (16 * 8)) { |
| /* For each 16byte chunk (or smaller) we generate a 32bit ubo vec |
| * load. |
| */ |
| unsigned subload_num_bits = MIN2(num_bits - i, 16 * 8); |
| nir_ssa_def *vec32 = |
| nir_load_ubo_dxil(b, 4, 32, buffer, nir_iadd(b, idx, nir_imm_int(b, i / (16 * 8)))); |
| |
| /* First re-arrange the vec32 to account for intra 16-byte offset. */ |
| vec32 = ubo_load_select_32b_comps(b, vec32, offset, subload_num_bits / 8); |
| |
| /* If we have 2 bytes or less to load we need to adjust the u32 value so |
| * we can always extract the LSB. |
| */ |
| if (subload_num_bits <= 16) { |
| nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset, |
| nir_imm_int(b, 3)), |
| nir_imm_int(b, 8)); |
| vec32 = nir_ushr(b, vec32, shift); |
| } |
| |
| /* And now comes the pack/unpack step to match the original type. */ |
| extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], |
| subload_num_bits / bit_size); |
| comp_idx += subload_num_bits / bit_size; |
| } |
| |
| assert(comp_idx == num_components); |
| return nir_vec(b, comps, num_components); |
| } |
| |
| static bool |
| lower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr) |
| { |
| assert(intr->dest.is_ssa); |
| assert(intr->src[0].is_ssa); |
| assert(intr->src[1].is_ssa); |
| |
| b->cursor = nir_before_instr(&intr->instr); |
| |
| nir_ssa_def *buffer = intr->src[0].ssa; |
| nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~3)); |
| enum gl_access_qualifier access = nir_intrinsic_access(intr); |
| unsigned bit_size = nir_dest_bit_size(intr->dest); |
| unsigned num_components = nir_dest_num_components(intr->dest); |
| unsigned num_bits = num_components * bit_size; |
| |
| nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; |
| unsigned comp_idx = 0; |
| |
| /* We need to split loads in 16byte chunks because that's the optimal |
| * granularity of bufferLoad(). Minimum alignment is 4byte, which saves |
| * from us from extra complexity to extract >= 32 bit components. |
| */ |
| for (unsigned i = 0; i < num_bits; i += 4 * 32) { |
| /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec |
| * load. |
| */ |
| unsigned subload_num_bits = MIN2(num_bits - i, 4 * 32); |
| |
| /* The number of components to store depends on the number of bytes. */ |
| nir_ssa_def *vec32 = |
| nir_load_ssbo(b, DIV_ROUND_UP(subload_num_bits, 32), 32, |
| buffer, nir_iadd(b, offset, nir_imm_int(b, i / 8)), |
| .align_mul = 4, |
| .align_offset = 0, |
| .access = access); |
| |
| /* If we have 2 bytes or less to load we need to adjust the u32 value so |
| * we can always extract the LSB. |
| */ |
| if (subload_num_bits <= 16) { |
| nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, 3)), |
| nir_imm_int(b, 8)); |
| vec32 = nir_ushr(b, vec32, shift); |
| } |
| |
| /* And now comes the pack/unpack step to match the original type. */ |
| extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], |
| subload_num_bits / bit_size); |
| comp_idx += subload_num_bits / bit_size; |
| } |
| |
| assert(comp_idx == num_components); |
| nir_ssa_def *result = nir_vec(b, comps, num_components); |
| nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); |
| nir_instr_remove(&intr->instr); |
| return true; |
| } |
| |
| static bool |
| lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr) |
| { |
| b->cursor = nir_before_instr(&intr->instr); |
| |
| assert(intr->src[0].is_ssa); |
| assert(intr->src[1].is_ssa); |
| assert(intr->src[2].is_ssa); |
| |
| nir_ssa_def *val = intr->src[0].ssa; |
| nir_ssa_def *buffer = intr->src[1].ssa; |
| nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~3)); |
| |
| unsigned bit_size = val->bit_size; |
| unsigned num_components = val->num_components; |
| unsigned num_bits = num_components * bit_size; |
| |
| nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; |
| unsigned comp_idx = 0; |
| |
| for (unsigned i = 0; i < num_components; i++) |
| comps[i] = nir_channel(b, val, i); |
| |
| /* We split stores in 16byte chunks because that's the optimal granularity |
| * of bufferStore(). Minimum alignment is 4byte, which saves from us from |
| * extra complexity to store >= 32 bit components. |
| */ |
| for (unsigned i = 0; i < num_bits; i += 4 * 32) { |
| /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec |
| * store. |
| */ |
| unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32); |
| nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8)); |
| nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx], |
| substore_num_bits / bit_size); |
| nir_intrinsic_instr *store; |
| |
| if (substore_num_bits < 32) { |
| nir_ssa_def *mask = nir_imm_int(b, (1 << substore_num_bits) - 1); |
| |
| /* If we have 16 bits or less to store we need to place them |
| * correctly in the u32 component. Anything greater than 16 bits |
| * (including uchar3) is naturally aligned on 32bits. |
| */ |
| if (substore_num_bits <= 16) { |
| nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, 3)); |
| nir_ssa_def *shift = nir_imul_imm(b, pos, 8); |
| |
| vec32 = nir_ishl(b, vec32, shift); |
| mask = nir_ishl(b, mask, shift); |
| } |
| |
| store = nir_intrinsic_instr_create(b->shader, |
| nir_intrinsic_store_ssbo_masked_dxil); |
| store->src[0] = nir_src_for_ssa(vec32); |
| store->src[1] = nir_src_for_ssa(nir_inot(b, mask)); |
| store->src[2] = nir_src_for_ssa(buffer); |
| store->src[3] = nir_src_for_ssa(local_offset); |
| } else { |
| store = nir_intrinsic_instr_create(b->shader, |
| nir_intrinsic_store_ssbo); |
| store->src[0] = nir_src_for_ssa(vec32); |
| store->src[1] = nir_src_for_ssa(buffer); |
| store->src[2] = nir_src_for_ssa(local_offset); |
| |
| nir_intrinsic_set_align(store, 4, 0); |
| } |
| |
| /* The number of components to store depends on the number of bits. */ |
| store->num_components = DIV_ROUND_UP(substore_num_bits, 32); |
| nir_builder_instr_insert(b, &store->instr); |
| comp_idx += substore_num_bits / bit_size; |
| } |
| |
| nir_instr_remove(&intr->instr); |
| return true; |
| } |
| |
| static void |
| lower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps, nir_ssa_def **comps, nir_intrinsic_op op) |
| { |
| for (unsigned i = 0; i < num_comps; i++) { |
| nir_intrinsic_instr *load = |
| nir_intrinsic_instr_create(b->shader, op); |
| |
| load->num_components = 1; |
| load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); |
| nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); |
| nir_builder_instr_insert(b, &load->instr); |
| comps[i] = &load->dest.ssa; |
| } |
| } |
| |
| static bool |
| lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr) |
| { |
| assert(intr->dest.is_ssa); |
| unsigned bit_size = nir_dest_bit_size(intr->dest); |
| unsigned num_components = nir_dest_num_components(intr->dest); |
| unsigned num_bits = num_components * bit_size; |
| |
| b->cursor = nir_before_instr(&intr->instr); |
| nir_intrinsic_op op = intr->intrinsic; |
| |
| assert(intr->src[0].is_ssa); |
| nir_ssa_def *offset = intr->src[0].ssa; |
| if (op == nir_intrinsic_load_shared) { |
| offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); |
| op = nir_intrinsic_load_shared_dxil; |
| } else { |
| offset = nir_u2u32(b, offset); |
| op = nir_intrinsic_load_scratch_dxil; |
| } |
| nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2)); |
| nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; |
| nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2]; |
| |
| /* We need to split loads in 32-bit accesses because the buffer |
| * is an i32 array and DXIL does not support type casts. |
| */ |
| unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32); |
| lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op); |
| unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4); |
| |
| for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) { |
| unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4); |
| unsigned num_dest_comps = num_vec32_comps * 32 / bit_size; |
| nir_ssa_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps); |
| |
| /* If we have 16 bits or less to load we need to adjust the u32 value so |
| * we can always extract the LSB. |
| */ |
| if (num_bits <= 16) { |
| nir_ssa_def *shift = |
| nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)), |
| nir_imm_int(b, 8)); |
| vec32 = nir_ushr(b, vec32, shift); |
| } |
| |
| /* And now comes the pack/unpack step to match the original type. */ |
| unsigned dest_index = i * 32 / bit_size; |
| extract_comps_from_vec32(b, vec32, bit_size, &comps[dest_index], num_dest_comps); |
| } |
| |
| nir_ssa_def *result = nir_vec(b, comps, num_components); |
| nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); |
| nir_instr_remove(&intr->instr); |
| |
| return true; |
| } |
| |
| static void |
| lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op) |
| { |
| |
| for (unsigned i = 0; i < vec32->num_components; i++) { |
| nir_intrinsic_instr *store = |
| nir_intrinsic_instr_create(b->shader, op); |
| |
| store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, i)); |
| store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); |
| store->num_components = 1; |
| nir_builder_instr_insert(b, &store->instr); |
| } |
| } |
| |
| static void |
| lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index, |
| nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op) |
| { |
| nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1); |
| |
| /* If we have 16 bits or less to store we need to place them correctly in |
| * the u32 component. Anything greater than 16 bits (including uchar3) is |
| * naturally aligned on 32bits. |
| */ |
| if (num_bits <= 16) { |
| nir_ssa_def *shift = |
| nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)), 8); |
| |
| vec32 = nir_ishl(b, vec32, shift); |
| mask = nir_ishl(b, mask, shift); |
| } |
| |
| if (op == nir_intrinsic_store_shared_dxil) { |
| /* Use the dedicated masked intrinsic */ |
| nir_store_shared_masked_dxil(b, vec32, nir_inot(b, mask), index); |
| } else { |
| /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */ |
| nir_ssa_def *load = nir_load_scratch_dxil(b, 1, 32, index); |
| |
| nir_ssa_def *new_val = nir_ior(b, vec32, |
| nir_iand(b, |
| nir_inot(b, mask), |
| load)); |
| |
| lower_store_vec32(b, index, new_val, op); |
| } |
| } |
| |
| static bool |
| lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr) |
| { |
| assert(intr->src[0].is_ssa); |
| unsigned num_components = nir_src_num_components(intr->src[0]); |
| unsigned bit_size = nir_src_bit_size(intr->src[0]); |
| unsigned num_bits = num_components * bit_size; |
| |
| b->cursor = nir_before_instr(&intr->instr); |
| nir_intrinsic_op op = intr->intrinsic; |
| |
| nir_ssa_def *offset = intr->src[1].ssa; |
| if (op == nir_intrinsic_store_shared) { |
| offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); |
| op = nir_intrinsic_store_shared_dxil; |
| } else { |
| offset = nir_u2u32(b, offset); |
| op = nir_intrinsic_store_scratch_dxil; |
| } |
| nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; |
| |
| unsigned comp_idx = 0; |
| for (unsigned i = 0; i < num_components; i++) |
| comps[i] = nir_channel(b, intr->src[0].ssa, i); |
| |
| for (unsigned i = 0; i < num_bits; i += 4 * 32) { |
| /* For each 4byte chunk (or smaller) we generate a 32bit scalar store. |
| */ |
| unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32); |
| nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8)); |
| nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx], |
| substore_num_bits / bit_size); |
| nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2)); |
| |
| /* For anything less than 32bits we need to use the masked version of the |
| * intrinsic to preserve data living in the same 32bit slot. |
| */ |
| if (num_bits < 32) { |
| lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op); |
| } else { |
| lower_store_vec32(b, index, vec32, op); |
| } |
| |
| comp_idx += substore_num_bits / bit_size; |
| } |
| |
| nir_instr_remove(&intr->instr); |
| |
| return true; |
| } |
| |
| static void |
| ubo_to_temp_patch_deref_mode(nir_deref_instr *deref) |
| { |
| deref->modes = nir_var_shader_temp; |
| nir_foreach_use(use_src, &deref->dest.ssa) { |
| if (use_src->parent_instr->type != nir_instr_type_deref) |
| continue; |
| |
| nir_deref_instr *parent = nir_instr_as_deref(use_src->parent_instr); |
| ubo_to_temp_patch_deref_mode(parent); |
| } |
| } |
| |
| static void |
| ubo_to_temp_update_entry(nir_deref_instr *deref, struct hash_entry *he) |
| { |
| assert(nir_deref_mode_is(deref, nir_var_mem_constant)); |
| assert(deref->dest.is_ssa); |
| assert(he->data); |
| |
| nir_foreach_use(use_src, &deref->dest.ssa) { |
| if (use_src->parent_instr->type == nir_instr_type_deref) { |
| ubo_to_temp_update_entry(nir_instr_as_deref(use_src->parent_instr), he); |
| } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) { |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(use_src->parent_instr); |
| if (intr->intrinsic != nir_intrinsic_load_deref) |
| he->data = NULL; |
| } else { |
| he->data = NULL; |
| } |
| |
| if (!he->data) |
| break; |
| } |
| } |
| |
| bool |
| dxil_nir_lower_ubo_to_temp(nir_shader *nir) |
| { |
| struct hash_table *ubo_to_temp = _mesa_pointer_hash_table_create(NULL); |
| bool progress = false; |
| |
| /* First pass: collect all UBO accesses that could be turned into |
| * shader temp accesses. |
| */ |
| foreach_list_typed(nir_function, func, node, &nir->functions) { |
| if (!func->is_entrypoint) |
| continue; |
| assert(func->impl); |
| |
| nir_foreach_block(block, func->impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_deref) |
| continue; |
| |
| nir_deref_instr *deref = nir_instr_as_deref(instr); |
| if (!nir_deref_mode_is(deref, nir_var_mem_constant) || |
| deref->deref_type != nir_deref_type_var) |
| continue; |
| |
| struct hash_entry *he = |
| _mesa_hash_table_search(ubo_to_temp, deref->var); |
| |
| if (!he) |
| he = _mesa_hash_table_insert(ubo_to_temp, deref->var, deref->var); |
| |
| if (!he->data) |
| continue; |
| |
| ubo_to_temp_update_entry(deref, he); |
| } |
| } |
| } |
| |
| hash_table_foreach(ubo_to_temp, he) { |
| nir_variable *var = he->data; |
| |
| if (!var) |
| continue; |
| |
| /* Change the variable mode. */ |
| var->data.mode = nir_var_shader_temp; |
| |
| /* Make sure the variable has a name. |
| * DXIL variables must have names. |
| */ |
| if (!var->name) |
| var->name = ralloc_asprintf(nir, "global_%d", exec_list_length(&nir->variables)); |
| |
| progress = true; |
| } |
| _mesa_hash_table_destroy(ubo_to_temp, NULL); |
| |
| /* Second pass: patch all derefs that were accessing the converted UBOs |
| * variables. |
| */ |
| foreach_list_typed(nir_function, func, node, &nir->functions) { |
| if (!func->is_entrypoint) |
| continue; |
| assert(func->impl); |
| |
| nir_foreach_block(block, func->impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_deref) |
| continue; |
| |
| nir_deref_instr *deref = nir_instr_as_deref(instr); |
| if (nir_deref_mode_is(deref, nir_var_mem_constant) && |
| deref->deref_type == nir_deref_type_var && |
| deref->var->data.mode == nir_var_shader_temp) |
| ubo_to_temp_patch_deref_mode(deref); |
| } |
| } |
| } |
| |
| return progress; |
| } |
| |
| static bool |
| lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr) |
| { |
| assert(intr->dest.is_ssa); |
| assert(intr->src[0].is_ssa); |
| assert(intr->src[1].is_ssa); |
| |
| b->cursor = nir_before_instr(&intr->instr); |
| |
| nir_ssa_def *result = |
| build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa, |
| nir_dest_num_components(intr->dest), |
| nir_dest_bit_size(intr->dest)); |
| |
| nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); |
| nir_instr_remove(&intr->instr); |
| return true; |
| } |
| |
| bool |
| dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir) |
| { |
| bool progress = false; |
| |
| foreach_list_typed(nir_function, func, node, &nir->functions) { |
| if (!func->is_entrypoint) |
| continue; |
| assert(func->impl); |
| |
| nir_builder b; |
| nir_builder_init(&b, func->impl); |
| |
| nir_foreach_block(block, func->impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| switch (intr->intrinsic) { |
| case nir_intrinsic_load_deref: |
| progress |= lower_load_deref(&b, intr); |
| break; |
| case nir_intrinsic_load_shared: |
| case nir_intrinsic_load_scratch: |
| progress |= lower_32b_offset_load(&b, intr); |
| break; |
| case nir_intrinsic_load_ssbo: |
| progress |= lower_load_ssbo(&b, intr); |
| break; |
| case nir_intrinsic_load_ubo: |
| progress |= lower_load_ubo(&b, intr); |
| break; |
| case nir_intrinsic_store_shared: |
| case nir_intrinsic_store_scratch: |
| progress |= lower_32b_offset_store(&b, intr); |
| break; |
| case nir_intrinsic_store_ssbo: |
| progress |= lower_store_ssbo(&b, intr); |
| break; |
| default: |
| break; |
| } |
| } |
| } |
| } |
| |
| return progress; |
| } |
| |
| static bool |
| lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, |
| nir_intrinsic_op dxil_op) |
| { |
| b->cursor = nir_before_instr(&intr->instr); |
| |
| assert(intr->src[0].is_ssa); |
| nir_ssa_def *offset = |
| nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr))); |
| nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2)); |
| |
| nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op); |
| atomic->src[0] = nir_src_for_ssa(index); |
| assert(intr->src[1].is_ssa); |
| atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa); |
| if (dxil_op == nir_intrinsic_shared_atomic_comp_swap_dxil) { |
| assert(intr->src[2].is_ssa); |
| atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa); |
| } |
| atomic->num_components = 0; |
| nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL); |
| |
| nir_builder_instr_insert(b, &atomic->instr); |
| nir_ssa_def_rewrite_uses(&intr->dest.ssa, &atomic->dest.ssa); |
| nir_instr_remove(&intr->instr); |
| return true; |
| } |
| |
| bool |
| dxil_nir_lower_atomics_to_dxil(nir_shader *nir) |
| { |
| bool progress = false; |
| |
| foreach_list_typed(nir_function, func, node, &nir->functions) { |
| if (!func->is_entrypoint) |
| continue; |
| assert(func->impl); |
| |
| nir_builder b; |
| nir_builder_init(&b, func->impl); |
| |
| nir_foreach_block(block, func->impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| switch (intr->intrinsic) { |
| |
| #define ATOMIC(op) \ |
| case nir_intrinsic_shared_atomic_##op: \ |
| progress |= lower_shared_atomic(&b, intr, \ |
| nir_intrinsic_shared_atomic_##op##_dxil); \ |
| break |
| |
| ATOMIC(add); |
| ATOMIC(imin); |
| ATOMIC(umin); |
| ATOMIC(imax); |
| ATOMIC(umax); |
| ATOMIC(and); |
| ATOMIC(or); |
| ATOMIC(xor); |
| ATOMIC(exchange); |
| ATOMIC(comp_swap); |
| |
| #undef ATOMIC |
| default: |
| break; |
| } |
| } |
| } |
| } |
| |
| return progress; |
| } |
| |
| static bool |
| lower_deref_ssbo(nir_builder *b, nir_deref_instr *deref) |
| { |
| assert(nir_deref_mode_is(deref, nir_var_mem_ssbo)); |
| assert(deref->deref_type == nir_deref_type_var || |
| deref->deref_type == nir_deref_type_cast); |
| nir_variable *var = deref->var; |
| |
| b->cursor = nir_before_instr(&deref->instr); |
| |
| if (deref->deref_type == nir_deref_type_var) { |
| /* We turn all deref_var into deref_cast and build a pointer value based on |
| * the var binding which encodes the UAV id. |
| */ |
| nir_ssa_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32); |
| nir_deref_instr *deref_cast = |
| nir_build_deref_cast(b, ptr, nir_var_mem_ssbo, deref->type, |
| glsl_get_explicit_stride(var->type)); |
| nir_ssa_def_rewrite_uses(&deref->dest.ssa, |
| &deref_cast->dest.ssa); |
| nir_instr_remove(&deref->instr); |
| |
| deref = deref_cast; |
| return true; |
| } |
| return false; |
| } |
| |
| bool |
| dxil_nir_lower_deref_ssbo(nir_shader *nir) |
| { |
| bool progress = false; |
| |
| foreach_list_typed(nir_function, func, node, &nir->functions) { |
| if (!func->is_entrypoint) |
| continue; |
| assert(func->impl); |
| |
| nir_builder b; |
| nir_builder_init(&b, func->impl); |
| |
| nir_foreach_block(block, func->impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_deref) |
| continue; |
| |
| nir_deref_instr *deref = nir_instr_as_deref(instr); |
| |
| if (!nir_deref_mode_is(deref, nir_var_mem_ssbo) || |
| (deref->deref_type != nir_deref_type_var && |
| deref->deref_type != nir_deref_type_cast)) |
| continue; |
| |
| progress |= lower_deref_ssbo(&b, deref); |
| } |
| } |
| } |
| |
| return progress; |
| } |
| |
| static bool |
| lower_alu_deref_srcs(nir_builder *b, nir_alu_instr *alu) |
| { |
| const nir_op_info *info = &nir_op_infos[alu->op]; |
| bool progress = false; |
| |
| b->cursor = nir_before_instr(&alu->instr); |
| |
| for (unsigned i = 0; i < info->num_inputs; i++) { |
| nir_deref_instr *deref = nir_src_as_deref(alu->src[i].src); |
| |
| if (!deref) |
| continue; |
| |
| nir_deref_path path; |
| nir_deref_path_init(&path, deref, NULL); |
| nir_deref_instr *root_deref = path.path[0]; |
| nir_deref_path_finish(&path); |
| |
| if (root_deref->deref_type != nir_deref_type_cast) |
| continue; |
| |
| nir_ssa_def *ptr = |
| nir_iadd(b, root_deref->parent.ssa, |
| nir_build_deref_offset(b, deref, cl_type_size_align)); |
| nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(ptr)); |
| progress = true; |
| } |
| |
| return progress; |
| } |
| |
| bool |
| dxil_nir_opt_alu_deref_srcs(nir_shader *nir) |
| { |
| bool progress = false; |
| |
| foreach_list_typed(nir_function, func, node, &nir->functions) { |
| if (!func->is_entrypoint) |
| continue; |
| assert(func->impl); |
| |
| bool progress = false; |
| nir_builder b; |
| nir_builder_init(&b, func->impl); |
| |
| nir_foreach_block(block, func->impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_alu) |
| continue; |
| |
| nir_alu_instr *alu = nir_instr_as_alu(instr); |
| progress |= lower_alu_deref_srcs(&b, alu); |
| } |
| } |
| } |
| |
| return progress; |
| } |
| |
| static nir_ssa_def * |
| memcpy_load_deref_elem(nir_builder *b, nir_deref_instr *parent, |
| nir_ssa_def *index) |
| { |
| nir_deref_instr *deref; |
| |
| index = nir_i2i(b, index, nir_dest_bit_size(parent->dest)); |
| assert(parent->deref_type == nir_deref_type_cast); |
| deref = nir_build_deref_ptr_as_array(b, parent, index); |
| |
| return nir_load_deref(b, deref); |
| } |
| |
| static void |
| memcpy_store_deref_elem(nir_builder *b, nir_deref_instr *parent, |
| nir_ssa_def *index, nir_ssa_def *value) |
| { |
| nir_deref_instr *deref; |
| |
| index = nir_i2i(b, index, nir_dest_bit_size(parent->dest)); |
| assert(parent->deref_type == nir_deref_type_cast); |
| deref = nir_build_deref_ptr_as_array(b, parent, index); |
| nir_store_deref(b, deref, value, 1); |
| } |
| |
| static bool |
| lower_memcpy_deref(nir_builder *b, nir_intrinsic_instr *intr) |
| { |
| nir_deref_instr *dst_deref = nir_src_as_deref(intr->src[0]); |
| nir_deref_instr *src_deref = nir_src_as_deref(intr->src[1]); |
| assert(intr->src[2].is_ssa); |
| nir_ssa_def *num_bytes = intr->src[2].ssa; |
| |
| assert(dst_deref && src_deref); |
| |
| b->cursor = nir_after_instr(&intr->instr); |
| |
| dst_deref = nir_build_deref_cast(b, &dst_deref->dest.ssa, dst_deref->modes, |
| glsl_uint8_t_type(), 1); |
| src_deref = nir_build_deref_cast(b, &src_deref->dest.ssa, src_deref->modes, |
| glsl_uint8_t_type(), 1); |
| |
| /* |
| * We want to avoid 64b instructions, so let's assume we'll always be |
| * passed a value that fits in a 32b type and truncate the 64b value. |
| */ |
| num_bytes = nir_u2u32(b, num_bytes); |
| |
| nir_variable *loop_index_var = |
| nir_local_variable_create(b->impl, glsl_uint_type(), "loop_index"); |
| nir_deref_instr *loop_index_deref = nir_build_deref_var(b, loop_index_var); |
| nir_store_deref(b, loop_index_deref, nir_imm_int(b, 0), 1); |
| |
| nir_loop *loop = nir_push_loop(b); |
| nir_ssa_def *loop_index = nir_load_deref(b, loop_index_deref); |
| nir_ssa_def *cmp = nir_ige(b, loop_index, num_bytes); |
| nir_if *loop_check = nir_push_if(b, cmp); |
| nir_jump(b, nir_jump_break); |
| nir_pop_if(b, loop_check); |
| nir_ssa_def *val = memcpy_load_deref_elem(b, src_deref, loop_index); |
| memcpy_store_deref_elem(b, dst_deref, loop_index, val); |
| nir_store_deref(b, loop_index_deref, nir_iadd_imm(b, loop_index, 1), 1); |
| nir_pop_loop(b, loop); |
| nir_instr_remove(&intr->instr); |
| return true; |
| } |
| |
| bool |
| dxil_nir_lower_memcpy_deref(nir_shader *nir) |
| { |
| bool progress = false; |
| |
| foreach_list_typed(nir_function, func, node, &nir->functions) { |
| if (!func->is_entrypoint) |
| continue; |
| assert(func->impl); |
| |
| nir_builder b; |
| nir_builder_init(&b, func->impl); |
| |
| nir_foreach_block(block, func->impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic) |
| continue; |
| |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| if (intr->intrinsic == nir_intrinsic_memcpy_deref) |
| progress |= lower_memcpy_deref(&b, intr); |
| } |
| } |
| } |
| |
| return progress; |
| } |
| |
| static void |
| cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size) |
| { |
| nir_phi_instr *lowered = nir_phi_instr_create(b->shader); |
| int num_components = 0; |
| int old_bit_size = phi->dest.ssa.bit_size; |
| |
| nir_op upcast_op = nir_type_conversion_op(nir_type_uint | old_bit_size, |
| nir_type_uint | new_bit_size, |
| nir_rounding_mode_undef); |
| nir_op downcast_op = nir_type_conversion_op(nir_type_uint | new_bit_size, |
| nir_type_uint | old_bit_size, |
| nir_rounding_mode_undef); |
| |
| nir_foreach_phi_src(src, phi) { |
| assert(num_components == 0 || num_components == src->src.ssa->num_components); |
| num_components = src->src.ssa->num_components; |
| |
| b->cursor = nir_after_instr_and_phis(src->src.ssa->parent_instr); |
| |
| nir_ssa_def *cast = nir_build_alu(b, upcast_op, src->src.ssa, NULL, NULL, NULL); |
| nir_phi_instr_add_src(lowered, src->pred, nir_src_for_ssa(cast)); |
| } |
| |
| nir_ssa_dest_init(&lowered->instr, &lowered->dest, |
| num_components, new_bit_size, NULL); |
| |
| b->cursor = nir_before_instr(&phi->instr); |
| nir_builder_instr_insert(b, &lowered->instr); |
| |
| b->cursor = nir_after_phis(nir_cursor_current_block(b->cursor)); |
| nir_ssa_def *result = nir_build_alu(b, downcast_op, &lowered->dest.ssa, NULL, NULL, NULL); |
| |
| nir_ssa_def_rewrite_uses(&phi->dest.ssa, result); |
| nir_instr_remove(&phi->instr); |
| } |
| |
| static bool |
| upcast_phi_impl(nir_function_impl *impl, unsigned min_bit_size) |
| { |
| nir_builder b; |
| nir_builder_init(&b, impl); |
| bool progress = false; |
| |
| nir_foreach_block_reverse(block, impl) { |
| nir_foreach_instr_safe(instr, block) { |
| if (instr->type != nir_instr_type_phi) |
| continue; |
| |
| nir_phi_instr *phi = nir_instr_as_phi(instr); |
| assert(phi->dest.is_ssa); |
| |
| if (phi->dest.ssa.bit_size == 1 || |
| phi->dest.ssa.bit_size >= min_bit_size) |
| continue; |
| |
| cast_phi(&b, phi, min_bit_size); |
| progress = true; |
| } |
| } |
| |
| if (progress) { |
| nir_metadata_preserve(impl, nir_metadata_block_index | |
| nir_metadata_dominance); |
| } else { |
| nir_metadata_preserve(impl, nir_metadata_all); |
| } |
| |
| return progress; |
| } |
| |
| bool |
| dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size) |
| { |
| bool progress = false; |
| |
| nir_foreach_function(function, shader) { |
| if (function->impl) |
| progress |= upcast_phi_impl(function->impl, min_bit_size); |
| } |
| |
| return progress; |
| } |
| |
| struct dxil_nir_split_clip_cull_distance_params { |
| nir_variable *new_var; |
| nir_shader *shader; |
| }; |
| |
| /* In GLSL and SPIR-V, clip and cull distance are arrays of floats (with a limit of 8). |
| * In DXIL, clip and cull distances are up to 2 float4s combined. |
| * Coming from GLSL, we can request this 2 float4 format, but coming from SPIR-V, |
| * we can't, and have to accept a "compact" array of scalar floats. |
| * |
| * To help emitting a valid input signature for this case, split the variables so that they |
| * match what we need to put in the signature (e.g. { float clip[4]; float clip1; float cull[3]; }) |
| */ |
| static bool |
| dxil_nir_split_clip_cull_distance_instr(nir_builder *b, |
| nir_instr *instr, |
| void *cb_data) |
| { |
| struct dxil_nir_split_clip_cull_distance_params *params = cb_data; |
| nir_variable *new_var = params->new_var; |
| |
| if (instr->type != nir_instr_type_deref) |
| return false; |
| |
| nir_deref_instr *deref = nir_instr_as_deref(instr); |
| nir_variable *var = nir_deref_instr_get_variable(deref); |
| if (!var || |
| var->data.location < VARYING_SLOT_CLIP_DIST0 || |
| var->data.location > VARYING_SLOT_CULL_DIST1 || |
| !var->data.compact) |
| return false; |
| |
| /* The location should only be inside clip distance, because clip |
| * and cull should've been merged by nir_lower_clip_cull_distance_arrays() |
| */ |
| assert(var->data.location == VARYING_SLOT_CLIP_DIST0 || |
| var->data.location == VARYING_SLOT_CLIP_DIST1); |
| |
| /* The deref chain to the clip/cull variables should be simple, just the |
| * var and an array with a constant index, otherwise more lowering/optimization |
| * might be needed before this pass, e.g. copy prop, lower_io_to_temporaries, |
| * split_var_copies, and/or lower_var_copies |
| */ |
| assert(deref->deref_type == nir_deref_type_var || |
| deref->deref_type == nir_deref_type_array); |
| |
| b->cursor = nir_before_instr(instr); |
| if (!new_var) { |
| /* Update lengths for new and old vars */ |
| int old_length = glsl_array_size(var->type); |
| int new_length = (old_length + var->data.location_frac) - 4; |
| old_length -= new_length; |
| |
| /* The existing variable fits in the float4 */ |
| if (new_length <= 0) |
| return false; |
| |
| new_var = nir_variable_clone(var, params->shader); |
| nir_shader_add_variable(params->shader, new_var); |
| assert(glsl_get_base_type(glsl_get_array_element(var->type)) == GLSL_TYPE_FLOAT); |
| var->type = glsl_array_type(glsl_float_type(), old_length, 0); |
| new_var->type = glsl_array_type(glsl_float_type(), new_length, 0); |
| new_var->data.location++; |
| new_var->data.location_frac = 0; |
| params->new_var = new_var; |
| } |
| |
| /* Update the type for derefs of the old var */ |
| if (deref->deref_type == nir_deref_type_var) { |
| deref->type = var->type; |
| return false; |
| } |
| |
| nir_const_value *index = nir_src_as_const_value(deref->arr.index); |
| assert(index); |
| |
| /* Treat this array as a vector starting at the component index in location_frac, |
| * so if location_frac is 1 and index is 0, then it's accessing the 'y' component |
| * of the vector. If index + location_frac is >= 4, there's no component there, |
| * so we need to add a new variable and adjust the index. |
| */ |
| unsigned total_index = index->u32 + var->data.location_frac; |
| if (total_index < 4) |
| return false; |
| |
| nir_deref_instr *new_var_deref = nir_build_deref_var(b, new_var); |
| nir_deref_instr *new_array_deref = nir_build_deref_array(b, new_var_deref, nir_imm_int(b, total_index % 4)); |
| nir_ssa_def_rewrite_uses(&deref->dest.ssa, &new_array_deref->dest.ssa); |
| return true; |
| } |
| |
| bool |
| dxil_nir_split_clip_cull_distance(nir_shader *shader) |
| { |
| struct dxil_nir_split_clip_cull_distance_params params = { |
| .new_var = NULL, |
| .shader = shader, |
| }; |
| nir_shader_instructions_pass(shader, |
| dxil_nir_split_clip_cull_distance_instr, |
| nir_metadata_block_index | |
| nir_metadata_dominance | |
| nir_metadata_loop_analysis, |
| ¶ms); |
| return params.new_var != NULL; |
| } |
| |
| static bool |
| dxil_nir_lower_double_math_instr(nir_builder *b, |
| nir_instr *instr, |
| UNUSED void *cb_data) |
| { |
| if (instr->type != nir_instr_type_alu) |
| return false; |
| |
| nir_alu_instr *alu = nir_instr_as_alu(instr); |
| |
| /* TODO: See if we can apply this explicitly to packs/unpacks that are then |
| * used as a double. As-is, if we had an app explicitly do a 64bit integer op, |
| * then try to bitcast to double (not expressible in HLSL, but it is in other |
| * source languages), this would unpack the integer and repack as a double, when |
| * we probably want to just send the bitcast through to the backend. |
| */ |
| |
| b->cursor = nir_before_instr(&alu->instr); |
| |
| bool progress = false; |
| for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; ++i) { |
| if (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i]) == nir_type_float && |
| alu->src[i].src.ssa->bit_size == 64) { |
| nir_ssa_def *packed_double = nir_channel(b, alu->src[i].src.ssa, alu->src[i].swizzle[0]); |
| nir_ssa_def *unpacked_double = nir_unpack_64_2x32(b, packed_double); |
| nir_ssa_def *repacked_double = nir_pack_double_2x32_dxil(b, unpacked_double); |
| nir_instr_rewrite_src_ssa(instr, &alu->src[i].src, repacked_double); |
| memset(alu->src[i].swizzle, 0, ARRAY_SIZE(alu->src[i].swizzle)); |
| progress = true; |
| } |
| } |
| |
| if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_float && |
| alu->dest.dest.ssa.bit_size == 64) { |
| b->cursor = nir_after_instr(&alu->instr); |
| nir_ssa_def *packed_double = &alu->dest.dest.ssa; |
| nir_ssa_def *unpacked_double = nir_unpack_double_2x32_dxil(b, packed_double); |
| nir_ssa_def *repacked_double = nir_pack_64_2x32(b, unpacked_double); |
| nir_ssa_def_rewrite_uses_after(packed_double, repacked_double, unpacked_double->parent_instr); |
| progress = true; |
| } |
| |
| return progress; |
| } |
| |
| bool |
| dxil_nir_lower_double_math(nir_shader *shader) |
| { |
| return nir_shader_instructions_pass(shader, |
| dxil_nir_lower_double_math_instr, |
| nir_metadata_block_index | |
| nir_metadata_dominance | |
| nir_metadata_loop_analysis, |
| NULL); |
| } |
| |
| typedef struct { |
| gl_system_value *values; |
| uint32_t count; |
| } zero_system_values_state; |
| |
| static bool |
| lower_system_value_to_zero_filter(const nir_instr* instr, const void* cb_state) |
| { |
| if (instr->type != nir_instr_type_intrinsic) { |
| return false; |
| } |
| |
| nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr); |
| |
| /* All the intrinsics we care about are loads */ |
| if (!nir_intrinsic_infos[intrin->intrinsic].has_dest) |
| return false; |
| |
| assert(intrin->dest.is_ssa); |
| |
| zero_system_values_state* state = (zero_system_values_state*)cb_state; |
| for (uint32_t i = 0; i < state->count; ++i) { |
| gl_system_value value = state->values[i]; |
| nir_intrinsic_op value_op = nir_intrinsic_from_system_value(value); |
| |
| if (intrin->intrinsic == value_op) { |
| return true; |
| } else if (intrin->intrinsic == nir_intrinsic_load_deref) { |
| nir_deref_instr* deref = nir_src_as_deref(intrin->src[0]); |
| if (!nir_deref_mode_is(deref, nir_var_system_value)) |
| return false; |
| |
| nir_variable* var = deref->var; |
| if (var->data.location == value) { |
| return true; |
| } |
| } |
| } |
| |
| return false; |
| } |
| |
| static nir_ssa_def* |
| lower_system_value_to_zero_instr(nir_builder* b, nir_instr* instr, void* _state) |
| { |
| return nir_imm_int(b, 0); |
| } |
| |
| bool |
| dxil_nir_lower_system_values_to_zero(nir_shader* shader, |
| gl_system_value* system_values, |
| uint32_t count) |
| { |
| zero_system_values_state state = { system_values, count }; |
| return nir_shader_lower_instructions(shader, |
| lower_system_value_to_zero_filter, |
| lower_system_value_to_zero_instr, |
| &state); |
| } |
| |
| static const struct glsl_type * |
| get_bare_samplers_for_type(const struct glsl_type *type) |
| { |
| if (glsl_type_is_sampler(type)) { |
| if (glsl_sampler_type_is_shadow(type)) |
| return glsl_bare_shadow_sampler_type(); |
| else |
| return glsl_bare_sampler_type(); |
| } else if (glsl_type_is_array(type)) { |
| return glsl_array_type( |
| get_bare_samplers_for_type(glsl_get_array_element(type)), |
| glsl_get_length(type), |
| 0 /*explicit size*/); |
| } |
| assert(!"Unexpected type"); |
| return NULL; |
| } |
| |
| static bool |
| redirect_sampler_derefs(struct nir_builder *b, nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_tex) |
| return false; |
| |
| nir_tex_instr *tex = nir_instr_as_tex(instr); |
| if (!nir_tex_instr_need_sampler(tex)) |
| return false; |
| |
| int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref); |
| if (sampler_idx == -1) { |
| /* No derefs, must be using indices */ |
| nir_variable *bare_sampler = _mesa_hash_table_u64_search(data, tex->sampler_index); |
| |
| /* Already have a bare sampler here */ |
| if (bare_sampler) |
| return false; |
| |
| nir_variable *typed_sampler = NULL; |
| nir_foreach_variable_with_modes(var, b->shader, nir_var_uniform) { |
| if (var->data.binding <= tex->sampler_index && |
| var->data.binding + glsl_type_get_sampler_count(var->type) > tex->sampler_index) { |
| /* Already have a bare sampler for this binding, add it to the table */ |
| if (glsl_get_sampler_result_type(glsl_without_array(var->type)) == GLSL_TYPE_VOID) { |
| _mesa_hash_table_u64_insert(data, tex->sampler_index, var); |
| return false; |
| } |
| |
| typed_sampler = var; |
| } |
| } |
| |
| /* Clone the typed sampler to a bare sampler and we're done */ |
| assert(typed_sampler); |
| bare_sampler = nir_variable_clone(typed_sampler, b->shader); |
| bare_sampler->type = get_bare_samplers_for_type(typed_sampler->type); |
| nir_shader_add_variable(b->shader, bare_sampler); |
| _mesa_hash_table_u64_insert(data, tex->sampler_index, bare_sampler); |
| return true; |
| } |
| |
| /* Using derefs, means we have to rewrite the deref chain in addition to cloning */ |
| nir_deref_instr *final_deref = nir_src_as_deref(tex->src[sampler_idx].src); |
| nir_deref_path path; |
| nir_deref_path_init(&path, final_deref, NULL); |
| |
| nir_deref_instr *old_tail = path.path[0]; |
| assert(old_tail->deref_type == nir_deref_type_var); |
| nir_variable *old_var = old_tail->var; |
| if (glsl_get_sampler_result_type(glsl_without_array(old_var->type)) == GLSL_TYPE_VOID) { |
| nir_deref_path_finish(&path); |
| return false; |
| } |
| |
| nir_variable *new_var = _mesa_hash_table_u64_search(data, old_var->data.binding); |
| if (!new_var) { |
| new_var = nir_variable_clone(old_var, b->shader); |
| new_var->type = get_bare_samplers_for_type(old_var->type); |
| nir_shader_add_variable(b->shader, new_var); |
| _mesa_hash_table_u64_insert(data, old_var->data.binding, new_var); |
| } |
| |
| b->cursor = nir_after_instr(&old_tail->instr); |
| nir_deref_instr *new_tail = nir_build_deref_var(b, new_var); |
| |
| for (unsigned i = 1; path.path[i]; ++i) { |
| b->cursor = nir_after_instr(&path.path[i]->instr); |
| new_tail = nir_build_deref_follower(b, new_tail, path.path[i]); |
| } |
| |
| nir_deref_path_finish(&path); |
| nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[sampler_idx].src, &new_tail->dest.ssa); |
| |
| return true; |
| } |
| |
| bool |
| dxil_nir_create_bare_samplers(nir_shader *nir) |
| { |
| struct hash_table_u64 *sampler_to_bare = _mesa_hash_table_u64_create(NULL); |
| |
| bool progress = nir_shader_instructions_pass(nir, redirect_sampler_derefs, |
| nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, sampler_to_bare); |
| |
| _mesa_hash_table_u64_destroy(sampler_to_bare); |
| return progress; |
| } |
| |
| |
| static bool |
| lower_bool_input_filter(const nir_instr *instr, |
| UNUSED const void *_options) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| if (intr->intrinsic == nir_intrinsic_load_front_face) |
| return true; |
| |
| if (intr->intrinsic == nir_intrinsic_load_deref) { |
| nir_deref_instr *deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); |
| nir_variable *var = nir_deref_instr_get_variable(deref); |
| return var->data.mode == nir_var_shader_in && |
| glsl_get_base_type(var->type) == GLSL_TYPE_BOOL; |
| } |
| |
| return false; |
| } |
| |
| static nir_ssa_def * |
| lower_bool_input_impl(nir_builder *b, nir_instr *instr, |
| UNUSED void *_options) |
| { |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| if (intr->intrinsic == nir_intrinsic_load_deref) { |
| nir_deref_instr *deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); |
| nir_variable *var = nir_deref_instr_get_variable(deref); |
| |
| /* rewrite var->type */ |
| var->type = glsl_vector_type(GLSL_TYPE_UINT, |
| glsl_get_vector_elements(var->type)); |
| deref->type = var->type; |
| } |
| |
| intr->dest.ssa.bit_size = 32; |
| return nir_i2b1(b, &intr->dest.ssa); |
| } |
| |
| bool |
| dxil_nir_lower_bool_input(struct nir_shader *s) |
| { |
| return nir_shader_lower_instructions(s, lower_bool_input_filter, |
| lower_bool_input_impl, NULL); |
| } |
| |
| /* Comparison function to sort io values so that first come normal varyings, |
| * then system values, and then system generated values. |
| */ |
| static int |
| variable_location_cmp(const nir_variable* a, const nir_variable* b) |
| { |
| // Sort by driver_location, location, then index |
| return a->data.driver_location != b->data.driver_location ? |
| a->data.driver_location - b->data.driver_location : |
| a->data.location != b->data.location ? |
| a->data.location - b->data.location : |
| a->data.index - b->data.index; |
| } |
| |
| /* Order varyings according to driver location */ |
| uint64_t |
| dxil_sort_by_driver_location(nir_shader* s, nir_variable_mode modes) |
| { |
| nir_sort_variables_with_modes(s, variable_location_cmp, modes); |
| |
| uint64_t result = 0; |
| nir_foreach_variable_with_modes(var, s, modes) { |
| result |= 1ull << var->data.location; |
| } |
| return result; |
| } |
| |
| /* Sort PS outputs so that color outputs come first */ |
| void |
| dxil_sort_ps_outputs(nir_shader* s) |
| { |
| nir_foreach_variable_with_modes_safe(var, s, nir_var_shader_out) { |
| /* We use the driver_location here to avoid introducing a new |
| * struct or member variable here. The true, updated driver location |
| * will be written below, after sorting */ |
| switch (var->data.location) { |
| case FRAG_RESULT_DEPTH: |
| var->data.driver_location = 1; |
| break; |
| case FRAG_RESULT_STENCIL: |
| var->data.driver_location = 2; |
| break; |
| case FRAG_RESULT_SAMPLE_MASK: |
| var->data.driver_location = 3; |
| break; |
| default: |
| var->data.driver_location = 0; |
| } |
| } |
| |
| nir_sort_variables_with_modes(s, variable_location_cmp, |
| nir_var_shader_out); |
| |
| unsigned driver_loc = 0; |
| nir_foreach_variable_with_modes(var, s, nir_var_shader_out) { |
| var->data.driver_location = driver_loc++; |
| } |
| } |
| |
| /* Order between stage values so that normal varyings come first, |
| * then sysvalues and then system generated values. |
| */ |
| uint64_t |
| dxil_reassign_driver_locations(nir_shader* s, nir_variable_mode modes, |
| uint64_t other_stage_mask) |
| { |
| nir_foreach_variable_with_modes_safe(var, s, modes) { |
| /* We use the driver_location here to avoid introducing a new |
| * struct or member variable here. The true, updated driver location |
| * will be written below, after sorting */ |
| var->data.driver_location = nir_var_to_dxil_sysvalue_type(var, other_stage_mask); |
| } |
| |
| nir_sort_variables_with_modes(s, variable_location_cmp, modes); |
| |
| uint64_t result = 0; |
| unsigned driver_loc = 0; |
| nir_foreach_variable_with_modes(var, s, modes) { |
| result |= 1ull << var->data.location; |
| var->data.driver_location = driver_loc++; |
| } |
| return result; |
| } |