| /* |
| * Copyright © 2018 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "nir_xfb_info.h" |
| |
| #include "util/u_dynarray.h" |
| #include <util/u_math.h> |
| |
| static void |
| add_var_xfb_varying(nir_xfb_info *xfb, |
| nir_xfb_varyings_info *varyings, |
| unsigned buffer, |
| unsigned offset, |
| const struct glsl_type *type) |
| { |
| if (varyings == NULL) |
| return; |
| |
| nir_xfb_varying_info *varying = &varyings->varyings[varyings->varying_count++]; |
| |
| varying->type = type; |
| varying->buffer = buffer; |
| varying->offset = offset; |
| xfb->buffers[buffer].varying_count++; |
| } |
| |
| |
| static nir_xfb_info * |
| nir_xfb_info_create(void *mem_ctx, uint16_t output_count) |
| { |
| return rzalloc_size(mem_ctx, nir_xfb_info_size(output_count)); |
| } |
| |
| static size_t |
| nir_xfb_varyings_info_size(uint16_t varying_count) |
| { |
| return sizeof(nir_xfb_info) + sizeof(nir_xfb_varying_info) * varying_count; |
| } |
| |
| static nir_xfb_varyings_info * |
| nir_xfb_varyings_info_create(void *mem_ctx, uint16_t varying_count) |
| { |
| return rzalloc_size(mem_ctx, nir_xfb_varyings_info_size(varying_count)); |
| } |
| |
| static void |
| add_var_xfb_outputs(nir_xfb_info *xfb, |
| nir_xfb_varyings_info *varyings, |
| nir_variable *var, |
| unsigned buffer, |
| unsigned *location, |
| unsigned *offset, |
| const struct glsl_type *type, |
| bool varying_added) |
| { |
| /* If this type contains a 64-bit value, align to 8 bytes */ |
| if (glsl_type_contains_64bit(type)) |
| *offset = ALIGN_POT(*offset, 8); |
| |
| if (glsl_type_is_array_or_matrix(type) && !var->data.compact) { |
| unsigned length = glsl_get_length(type); |
| |
| const struct glsl_type *child_type = glsl_get_array_element(type); |
| if (!glsl_type_is_array(child_type) && |
| !glsl_type_is_struct(child_type)) { |
| |
| add_var_xfb_varying(xfb, varyings, buffer, *offset, type); |
| varying_added = true; |
| } |
| |
| for (unsigned i = 0; i < length; i++) |
| add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset, |
| child_type, varying_added); |
| } else if (glsl_type_is_struct_or_ifc(type)) { |
| unsigned length = glsl_get_length(type); |
| for (unsigned i = 0; i < length; i++) { |
| const struct glsl_type *child_type = glsl_get_struct_field(type, i); |
| add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset, |
| child_type, varying_added); |
| } |
| } else { |
| assert(buffer < NIR_MAX_XFB_BUFFERS); |
| if (xfb->buffers_written & (1 << buffer)) { |
| assert(xfb->buffers[buffer].stride == var->data.xfb.stride); |
| assert(xfb->buffer_to_stream[buffer] == var->data.stream); |
| } else { |
| xfb->buffers_written |= (1 << buffer); |
| xfb->buffers[buffer].stride = var->data.xfb.stride; |
| xfb->buffer_to_stream[buffer] = var->data.stream; |
| } |
| |
| assert(var->data.stream < NIR_MAX_XFB_STREAMS); |
| xfb->streams_written |= (1 << var->data.stream); |
| |
| unsigned comp_slots; |
| if (var->data.compact) { |
| /* This only happens for clip/cull which are float arrays */ |
| assert(glsl_without_array(type) == glsl_float_type()); |
| assert(var->data.location == VARYING_SLOT_CLIP_DIST0 || |
| var->data.location == VARYING_SLOT_CLIP_DIST1); |
| comp_slots = glsl_get_length(type); |
| } else { |
| comp_slots = glsl_get_component_slots(type); |
| |
| UNUSED unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4); |
| assert(attrib_slots == glsl_count_attribute_slots(type, false)); |
| |
| /* Ensure that we don't have, for instance, a dvec2 with a |
| * location_frac of 2 which would make it crass a location boundary |
| * even though it fits in a single slot. However, you can have a |
| * dvec3 which crosses the slot boundary with a location_frac of 2. |
| */ |
| assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) == |
| attrib_slots); |
| } |
| |
| assert(var->data.location_frac + comp_slots <= 8); |
| uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac; |
| unsigned comp_offset = var->data.location_frac; |
| |
| if (!varying_added) { |
| add_var_xfb_varying(xfb, varyings, buffer, *offset, type); |
| } |
| |
| while (comp_mask) { |
| nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++]; |
| |
| output->buffer = buffer; |
| output->offset = *offset; |
| output->location = *location; |
| output->component_mask = comp_mask & 0xf; |
| output->component_offset = comp_offset; |
| |
| *offset += util_bitcount(output->component_mask) * 4; |
| (*location)++; |
| comp_mask >>= 4; |
| comp_offset = 0; |
| } |
| } |
| } |
| |
| static int |
| compare_xfb_varying_offsets(const void *_a, const void *_b) |
| { |
| const nir_xfb_varying_info *a = _a, *b = _b; |
| |
| if (a->buffer != b->buffer) |
| return a->buffer - b->buffer; |
| |
| return a->offset - b->offset; |
| } |
| |
| static int |
| compare_xfb_output_offsets(const void *_a, const void *_b) |
| { |
| const nir_xfb_output_info *a = _a, *b = _b; |
| |
| return a->offset - b->offset; |
| } |
| |
| void |
| nir_shader_gather_xfb_info(nir_shader *shader) |
| { |
| nir_gather_xfb_info_with_varyings(shader, NULL, NULL); |
| } |
| |
| void |
| nir_gather_xfb_info_with_varyings(nir_shader *shader, |
| void *mem_ctx, |
| nir_xfb_varyings_info **varyings_info_out) |
| { |
| assert(shader->info.stage == MESA_SHADER_VERTEX || |
| shader->info.stage == MESA_SHADER_TESS_EVAL || |
| shader->info.stage == MESA_SHADER_GEOMETRY); |
| |
| /* Compute the number of outputs we have. This is simply the number of |
| * cumulative locations consumed by all the variables. If a location is |
| * represented by multiple variables, then they each count separately in |
| * number of outputs. This is only an estimate as some variables may have |
| * an xfb_buffer but not an output so it may end up larger than we need but |
| * it should be good enough for allocation. |
| */ |
| unsigned num_outputs = 0; |
| unsigned num_varyings = 0; |
| nir_xfb_varyings_info *varyings_info = NULL; |
| nir_foreach_shader_out_variable(var, shader) { |
| if (var->data.explicit_xfb_buffer) { |
| num_outputs += glsl_count_attribute_slots(var->type, false); |
| num_varyings += glsl_varying_count(var->type); |
| } |
| } |
| if (num_outputs == 0 || num_varyings == 0) |
| return; |
| |
| nir_xfb_info *xfb = nir_xfb_info_create(shader, num_outputs); |
| if (varyings_info_out != NULL) { |
| *varyings_info_out = nir_xfb_varyings_info_create(mem_ctx, num_varyings); |
| varyings_info = *varyings_info_out; |
| } |
| |
| /* Walk the list of outputs and add them to the array */ |
| nir_foreach_shader_out_variable(var, shader) { |
| if (!var->data.explicit_xfb_buffer) |
| continue; |
| |
| unsigned location = var->data.location; |
| |
| /* In order to know if we have a array of blocks can't be done just by |
| * checking if we have an interface type and is an array, because due |
| * splitting we could end on a case were we received a split struct |
| * that contains an array. |
| */ |
| bool is_array_block = var->interface_type != NULL && |
| glsl_type_is_array(var->type) && |
| glsl_without_array(var->type) == var->interface_type; |
| |
| if (var->data.explicit_offset && !is_array_block) { |
| unsigned offset = var->data.offset; |
| add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer, |
| &location, &offset, var->type, false); |
| } else if (is_array_block) { |
| assert(glsl_type_is_struct_or_ifc(var->interface_type)); |
| |
| unsigned aoa_size = glsl_get_aoa_size(var->type); |
| const struct glsl_type *itype = var->interface_type; |
| unsigned nfields = glsl_get_length(itype); |
| for (unsigned b = 0; b < aoa_size; b++) { |
| for (unsigned f = 0; f < nfields; f++) { |
| int foffset = glsl_get_struct_field_offset(itype, f); |
| const struct glsl_type *ftype = glsl_get_struct_field(itype, f); |
| if (foffset < 0) { |
| location += glsl_count_attribute_slots(ftype, false); |
| continue; |
| } |
| |
| unsigned offset = foffset; |
| add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer + b, |
| &location, &offset, ftype, false); |
| } |
| } |
| } |
| } |
| |
| /* Everything is easier in the state setup code if outputs and varyings are |
| * sorted in order of output offset (and buffer for varyings). |
| */ |
| qsort(xfb->outputs, xfb->output_count, sizeof(xfb->outputs[0]), |
| compare_xfb_output_offsets); |
| |
| if (varyings_info != NULL) { |
| qsort(varyings_info->varyings, varyings_info->varying_count, |
| sizeof(varyings_info->varyings[0]), |
| compare_xfb_varying_offsets); |
| } |
| |
| #ifndef NDEBUG |
| /* Finally, do a sanity check */ |
| unsigned max_offset[NIR_MAX_XFB_BUFFERS] = {0}; |
| for (unsigned i = 0; i < xfb->output_count; i++) { |
| assert(xfb->outputs[i].offset >= max_offset[xfb->outputs[i].buffer]); |
| assert(xfb->outputs[i].component_mask != 0); |
| unsigned slots = util_bitcount(xfb->outputs[i].component_mask); |
| max_offset[xfb->outputs[i].buffer] = xfb->outputs[i].offset + slots * 4; |
| } |
| #endif |
| |
| ralloc_free(shader->xfb_info); |
| shader->xfb_info = xfb; |
| } |
| |
| static int |
| get_xfb_out_sort_index(const nir_xfb_output_info *a) |
| { |
| /* Return the maximum number to put dummy components at the end. */ |
| if (!a->component_mask) |
| return MAX_XFB_BUFFERS << 26; |
| |
| return ((uint32_t)a->buffer << 26) | /* 2 bits for the buffer */ |
| /* 10 bits for the component location (256 * 4) */ |
| (((uint32_t)a->location * 4 + a->component_offset) << 16) | |
| /* 16 bits for the offset */ |
| a->offset; |
| } |
| |
/**
 * qsort() comparator for nir_xfb_output_info based on
 * get_xfb_out_sort_index(): returns the difference of the two sort keys.
 */
static int
compare_xfb_out(const void *pa, const void *pb)
{
   const int key_a = get_xfb_out_sort_index(pa);
   const int key_b = get_xfb_out_sort_index(pb);

   return key_a - key_b;
}
| |
| /** |
| * Gather transform feedback info from lowered IO intrinsics. |
| * |
| * Optionally return slot_to_register, an optional table to translate |
| * gl_varying_slot to "base" indices. |
| */ |
| nir_xfb_info * |
| nir_gather_xfb_info_from_intrinsics(nir_shader *nir, |
| int slot_to_register[NUM_TOTAL_VARYING_SLOTS]) |
| { |
| nir_function_impl *impl = nir_shader_get_entrypoint(nir); |
| uint8_t buffer_to_stream[MAX_XFB_BUFFERS] = {0}; |
| uint8_t buffer_mask = 0; |
| uint8_t stream_mask = 0; |
| |
| if (slot_to_register) { |
| memset(slot_to_register, -1, |
| sizeof(slot_to_register[0] * NUM_TOTAL_VARYING_SLOTS)); |
| } |
| |
| /* Gather xfb outputs. */ |
| struct util_dynarray array = {0}; |
| |
| nir_foreach_block(block, impl) { |
| nir_foreach_instr(instr, block) { |
| if (instr->type != nir_instr_type_intrinsic || |
| !nir_instr_xfb_write_mask(nir_instr_as_intrinsic(instr))) |
| continue; |
| |
| nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); |
| |
| unsigned wr_mask = nir_intrinsic_write_mask(intr); |
| |
| while (wr_mask) { |
| unsigned i = u_bit_scan(&wr_mask); |
| unsigned index = nir_intrinsic_component(intr) + i; |
| nir_io_xfb xfb = index < 2 ? nir_intrinsic_io_xfb(intr) : |
| nir_intrinsic_io_xfb2(intr); |
| |
| if (xfb.out[index % 2].num_components) { |
| nir_io_semantics sem = nir_intrinsic_io_semantics(intr); |
| nir_xfb_output_info out; |
| |
| out.component_offset = index; |
| out.component_mask = |
| BITFIELD_RANGE(index, xfb.out[index % 2].num_components); |
| out.location = sem.location; |
| out.buffer = xfb.out[index % 2].buffer; |
| out.offset = (uint32_t)xfb.out[index % 2].offset * 4; |
| util_dynarray_append(&array, nir_xfb_output_info, out); |
| |
| uint8_t stream = (sem.gs_streams >> (i * 2)) & 0x3; |
| buffer_to_stream[out.buffer] = stream; |
| buffer_mask |= BITFIELD_BIT(out.buffer); |
| stream_mask |= BITFIELD_BIT(stream); |
| |
| if (slot_to_register) |
| slot_to_register[sem.location] = nir_intrinsic_base(intr); |
| |
| /* No elements before component_offset are allowed to be set. */ |
| assert(!(out.component_mask & BITFIELD_MASK(out.component_offset))); |
| } |
| } |
| } |
| } |
| |
| nir_xfb_output_info *outputs = (nir_xfb_output_info *)array.data; |
| int count = util_dynarray_num_elements(&array, nir_xfb_output_info); |
| |
| if (!count) |
| return NULL; |
| |
| if (count > 1) { |
| /* Sort outputs by buffer, location, and component. */ |
| qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out); |
| |
| /* Merge outputs referencing the same slot. */ |
| for (int i = 0; i < count - 1; i++) { |
| nir_xfb_output_info *cur = &outputs[i]; |
| |
| if (!cur->component_mask) |
| continue; |
| |
| /* Outputs referencing the same buffer and location are contiguous. */ |
| for (int j = i + 1; |
| j < count && |
| cur->buffer == outputs[j].buffer && |
| cur->location == outputs[j].location; j++) { |
| if (outputs[j].component_mask && |
| outputs[j].offset - outputs[j].component_offset * 4 == |
| cur->offset - cur->component_offset * 4) { |
| unsigned merged_offset = MIN2(cur->component_offset, |
| outputs[j].component_offset); |
| /* component_mask is relative to 0, not component_offset */ |
| unsigned merged_mask = cur->component_mask | outputs[j].component_mask; |
| |
| /* The component mask should have no holes after merging. */ |
| if (util_is_power_of_two_nonzero((merged_mask >> merged_offset) + 1)) { |
| /* Merge outputs. */ |
| cur->component_offset = merged_offset; |
| cur->component_mask = merged_mask; |
| cur->offset = (uint32_t)cur->offset - |
| (uint32_t)cur->component_offset * 4 + |
| (uint32_t)merged_offset * 4; |
| /* Disable the other output. */ |
| outputs[j].component_mask = 0; |
| } |
| } |
| } |
| } |
| |
| /* Sort outputs again to put disabled outputs at the end. */ |
| qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out); |
| |
| /* Remove disabled outputs. */ |
| for (int i = count - 1; i >= 0 && !outputs[i].component_mask; i--) |
| count = i; |
| } |
| |
| for (unsigned i = 0; i < count; i++) |
| assert(outputs[i].component_mask); |
| |
| /* Create nir_xfb_info. */ |
| nir_xfb_info *info = calloc(1, nir_xfb_info_size(count)); |
| if (!info) { |
| util_dynarray_fini(&array); |
| return NULL; |
| } |
| |
| /* Fill nir_xfb_info. */ |
| info->buffers_written = buffer_mask; |
| info->streams_written = stream_mask; |
| memcpy(info->buffer_to_stream, buffer_to_stream, sizeof(buffer_to_stream)); |
| info->output_count = count; |
| memcpy(info->outputs, outputs, count * sizeof(outputs[0])); |
| |
| /* Set strides. */ |
| for (unsigned i = 0; i < MAX_XFB_BUFFERS; i++) { |
| if (buffer_mask & BITFIELD_BIT(i)) |
| info->buffers[i].stride = nir->info.xfb_stride[i]; |
| } |
| |
| /* Set varying_count. */ |
| for (unsigned i = 0; i < count; i++) |
| info->buffers[outputs[i].buffer].varying_count++; |
| |
| util_dynarray_fini(&array); |
| return info; |
| } |
| |
| void |
| nir_print_xfb_info(nir_xfb_info *info, FILE *fp) |
| { |
| fprintf(fp, "buffers_written: 0x%x\n", info->buffers_written); |
| fprintf(fp, "streams_written: 0x%x\n", info->streams_written); |
| |
| for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) { |
| if (BITFIELD_BIT(i) & info->buffers_written) { |
| fprintf(fp, "buffer%u: stride=%u varying_count=%u stream=%u\n", i, |
| info->buffers[i].stride, |
| info->buffers[i].varying_count, |
| info->buffer_to_stream[i]); |
| } |
| } |
| |
| fprintf(fp, "output_count: %u\n", info->output_count); |
| |
| for (unsigned i = 0; i < info->output_count; i++) { |
| fprintf(fp, "output%u: buffer=%u, offset=%u, location=%u, " |
| "component_offset=%u, component_mask=0x%x\n", |
| i, info->outputs[i].buffer, |
| info->outputs[i].offset, |
| info->outputs[i].location, |
| info->outputs[i].component_offset, |
| info->outputs[i].component_mask); |
| } |
| } |