| /* |
| * Copyright 2024 Alyssa Rosenzweig |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| #include "agx_linker.h" |
| #include <stddef.h> |
| #include <stdint.h> |
| #include "util/ralloc.h" |
| #include "agx_compile.h" |
| #include "agx_device.h" |
| #include "agx_pack.h" |
| #include "agx_scratch.h" |
| |
| /* |
| * When sample shading is used with a non-monolithic fragment shader, we |
| * fast-link a program with the following structure: |
| * |
| * Fragment prolog; |
| * |
| * for (u16 sample_bit = 1; sample_bit < (1 << # of samples); ++sample_bit) { |
| * API fragment shader; |
| * Fragment epilog; |
| * } |
| * |
| * This means the prolog runs per-pixel but the fragment shader and epilog run |
| * per-sample. To do this, we need to generate the loop on the fly. The |
| * following binary sequences form the relevant loop. |
| */ |
| |
| /* clang-format off */ |
| static const uint8_t sample_loop_header[] = { |
| /* mov_imm r0, 0x10000, 0b0 */ |
| 0x62, 0x01, 0x00, 0x00, 0x01, 0x00, |
| }; |
| |
| #define STOP \ |
| /* stop */ \ |
| 0x88, 0x00, \ |
| \ |
| /* trap */ \ |
| 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, \ |
| 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, |
| |
| static const uint8_t stop[] = {STOP}; |
| |
| static const uint8_t sample_loop_footer[] = { |
| /* iadd r0h, 0, r0h, lsl 1 */ |
| 0x0e, 0x02, 0x00, 0x10, 0x84, 0x00, 0x00, 0x00, |
| |
| /* while_icmp r0l, ult, r0h, 0, 1 */ |
| 0x52, 0x2c, 0x41, 0x00, 0x00, 0x00, |
| |
| /* jmp_exec_any */ |
| 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, |
| |
| /* pop_exec r0l, 1 */ |
| 0x52, 0x0e, 0x00, 0x00, 0x00, 0x00, |
| |
| STOP |
| }; |
| |
| /* Offset in sample_loop_footer to the jmp_exec_any's target */ |
| #define SAMPLE_LOOP_FOOTER_JMP_PATCH_OFFS (16) |
| |
| /* Offset of the jmp_exec_any, for calculating the PC offsets */ |
| #define SAMPLE_LOOP_FOOTER_JMP_OFFS (14) |
| |
| /* Offset in sample_loop_footer to the while_icmp's sample count immediate. Bit |
| * position in the byte given by the shift. |
| */ |
| #define SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS (11) |
| #define SAMPLE_LOOP_FOOTER_COUNT_SHIFT (4) |
| /* clang-format on */ |
| |
| void |
| agx_fast_link(struct agx_linked_shader *linked, struct agx_device *dev, |
| bool fragment, struct agx_shader_part *main, |
| struct agx_shader_part *prolog, struct agx_shader_part *epilog, |
| unsigned nr_samples_shaded) |
| { |
| size_t size = 0; |
| unsigned nr_gprs = 0, scratch_size = 0; |
| bool reads_tib = false, writes_sample_mask = false, |
| disable_tri_merging = false, tag_write_disable = true; |
| |
| if (nr_samples_shaded) { |
| size += sizeof(sample_loop_header); |
| |
| if (nr_samples_shaded > 1) |
| size += sizeof(sample_loop_footer); |
| else |
| size += sizeof(stop); |
| } |
| |
| struct agx_shader_part *parts[] = {prolog, main, epilog}; |
| |
| for (unsigned i = 0; i < ARRAY_SIZE(parts); ++i) { |
| struct agx_shader_part *part = parts[i]; |
| if (!part) |
| continue; |
| |
| assert(part->info.main_offset == 0); |
| size += part->info.main_size; |
| |
| nr_gprs = MAX2(nr_gprs, part->info.nr_gprs); |
| scratch_size = MAX2(scratch_size, part->info.scratch_size); |
| reads_tib |= part->info.reads_tib; |
| writes_sample_mask |= part->info.writes_sample_mask; |
| disable_tri_merging |= part->info.disable_tri_merging; |
| linked->uses_base_param |= part->info.uses_base_param; |
| linked->uses_txf |= part->info.uses_txf; |
| tag_write_disable &= part->info.tag_write_disable; |
| } |
| |
| assert(size > 0 && "must stop"); |
| |
| linked->bo = agx_bo_create(dev, size, 0, AGX_BO_EXEC | AGX_BO_LOW_VA, |
| "Linked executable"); |
| |
| size_t offset = 0; |
| |
| /* FS prolog happens per-pixel, outside the sample loop */ |
| if (prolog) { |
| size_t sz = prolog->info.main_size; |
| memcpy((uint8_t *)linked->bo->map + offset, prolog->binary, sz); |
| offset += sz; |
| } |
| |
| if (nr_samples_shaded) { |
| memcpy((uint8_t *)linked->bo->map + offset, sample_loop_header, |
| sizeof(sample_loop_header)); |
| offset += sizeof(sample_loop_header); |
| } |
| |
| size_t sample_loop_begin = offset; |
| |
| /* Main shader and epilog happen in the sample loop, so start from i=1 */ |
| for (unsigned i = 1; i < ARRAY_SIZE(parts); ++i) { |
| struct agx_shader_part *part = parts[i]; |
| if (!part) |
| continue; |
| |
| size_t sz = part->info.main_size; |
| memcpy((uint8_t *)linked->bo->map + offset, part->binary, sz); |
| offset += sz; |
| } |
| |
| if (nr_samples_shaded > 1) { |
| assert(sample_loop_footer[SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS] == 0); |
| |
| /* Make a stack copy of the footer so we can efficiently patch it */ |
| uint8_t footer[sizeof(sample_loop_footer)]; |
| memcpy(footer, sample_loop_footer, sizeof(footer)); |
| |
| /* Patch in sample end */ |
| uint8_t end = (1u << nr_samples_shaded) - 1; |
| footer[SAMPLE_LOOP_FOOTER_COUNT_PATCH_OFFS] = |
| end << SAMPLE_LOOP_FOOTER_COUNT_SHIFT; |
| |
| /* Patch in the branch target */ |
| int32_t loop_size = offset - sample_loop_begin; |
| int32_t branch_offs = -(SAMPLE_LOOP_FOOTER_JMP_OFFS + loop_size); |
| int32_t *target = (int32_t *)(footer + SAMPLE_LOOP_FOOTER_JMP_PATCH_OFFS); |
| *target = branch_offs; |
| |
| /* Copy in the patched footer */ |
| memcpy((uint8_t *)linked->bo->map + offset, footer, sizeof(footer)); |
| offset += sizeof(footer); |
| } else if (nr_samples_shaded) { |
| /* Just end after the first sample, no need to loop for a single sample */ |
| memcpy((uint8_t *)linked->bo->map + offset, stop, sizeof(stop)); |
| offset += sizeof(stop); |
| } |
| |
| assert(offset == size); |
| |
| agx_pack(&linked->shader, USC_SHADER, cfg) { |
| cfg.code = agx_usc_addr(dev, linked->bo->va->addr); |
| cfg.unk_2 = fragment ? 2 : 3; |
| |
| if (fragment) |
| cfg.loads_varyings = linked->cf.nr_bindings > 0; |
| } |
| |
| agx_pack(&linked->regs, USC_REGISTERS, cfg) { |
| cfg.register_count = nr_gprs; |
| cfg.unk_1 = fragment; |
| cfg.spill_size = scratch_size ? agx_scratch_get_bucket(scratch_size) : 0; |
| } |
| |
| if (fragment) { |
| agx_pack(&linked->fragment_props, USC_FRAGMENT_PROPERTIES, cfg) { |
| cfg.early_z_testing = !writes_sample_mask; |
| cfg.unk_4 = 0x2; |
| cfg.unk_5 = 0x0; |
| } |
| |
| agx_pack(&linked->fragment_control, FRAGMENT_CONTROL, cfg) { |
| cfg.tag_write_disable = tag_write_disable; |
| cfg.disable_tri_merging = disable_tri_merging; |
| |
| if (reads_tib && writes_sample_mask) |
| cfg.pass_type = AGX_PASS_TYPE_TRANSLUCENT_PUNCH_THROUGH; |
| else if (reads_tib) |
| cfg.pass_type = AGX_PASS_TYPE_TRANSLUCENT; |
| else if (writes_sample_mask) |
| cfg.pass_type = AGX_PASS_TYPE_PUNCH_THROUGH; |
| else |
| cfg.pass_type = AGX_PASS_TYPE_OPAQUE; |
| } |
| |
| /* Merge the CF binding lists from the prolog to handle cull distance */ |
| memcpy(&linked->cf, &main->info.varyings.fs, |
| sizeof(struct agx_varyings_fs)); |
| |
| struct agx_varyings_fs *prolog_vary = |
| prolog ? &prolog->info.varyings.fs : NULL; |
| |
| if (prolog_vary && prolog_vary->nr_bindings) { |
| assert(!prolog_vary->reads_z); |
| linked->cf.nr_cf = MAX2(linked->cf.nr_cf, prolog_vary->nr_cf); |
| |
| assert(linked->cf.nr_bindings + prolog_vary->nr_bindings <= |
| ARRAY_SIZE(linked->cf.bindings) && |
| "bounded by # of coeff registers"); |
| |
| memcpy(linked->cf.bindings + linked->cf.nr_bindings, |
| prolog_vary->bindings, |
| sizeof(struct agx_cf_binding) * prolog_vary->nr_bindings); |
| |
| linked->cf.nr_bindings += prolog_vary->nr_bindings; |
| } |
| |
| agx_pack(&linked->osel, OUTPUT_SELECT, cfg) { |
| cfg.varyings = linked->cf.nr_bindings > 0; |
| cfg.frag_coord_z = linked->cf.reads_z; |
| } |
| } |
| } |