src/freedreno/decode/rdcompiler-utils.h - third_party/mesa - Git at Google

 /*
  * Copyright © 2022 Igalia S.L.
  * SPDX-License-Identifier: MIT
  */

 #include <assert.h>
 #include <err.h>
 #include <getopt.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>

 #include "redump.h"

 #include "util/u_math.h"

 #include "adreno_common.xml.h"
 #include "adreno_pm4.xml.h"
 #include "freedreno_pm4.h"

 #include "a6xx.xml.h"

 #include "ir3/ir3_assembler.h"
 #include "ir3/ir3_compiler.h"
 #include "ir3/ir3_shader.h"

 #include "util/list.h"
 #include "util/vma.h"

 struct cmdstream {
    struct list_head link;

    uint32_t *mem;
    uint32_t total_size;
    uint32_t cur;

    uint64_t iova;
 };

 static uint64_t
 cs_get_cur_iova(struct cmdstream *cs)
 {
    return cs->iova + cs->cur * sizeof(uint32_t);
 }

 struct wrbuf {
    struct list_head link;

    uint64_t iova;
    uint64_t size;
    uint64_t clear;
    const char *name;
 };

 struct replay_context {
    void *mem_ctx;

    struct util_vma_heap vma;

    struct cmdstream *submit_cs;
    struct cmdstream *state_cs;
    struct cmdstream *shader_cs;

    struct cmdstream *shader_log;
    struct cmdstream *cp_log;

    struct list_head cs_list;

    struct list_head wrbuf_list;

    struct ir3_compiler *compiler;

    struct hash_table_u64 *compiled_shaders;

    const char *output_name;
 };

 static void
 pkt(struct cmdstream *cs, uint32_t payload)
 {
    assert(cs->cur <= cs->total_size);
    cs->mem[cs->cur++] = payload;
 }

 static void
 pkt_qw(struct cmdstream *cs, uint64_t payload)
 {
    pkt(cs, payload);
    pkt(cs, payload >> 32);
 }

 static uint64_t
 pkt_blob(struct cmdstream *cs, void *payload, uint32_t size, uint32_t alignment)
 {
    cs->cur = align(cs->cur, alignment / sizeof(uint32_t));
    uint64_t start_iova = cs_get_cur_iova(cs);

    memcpy(cs->mem + cs->cur, payload, size);
    cs->cur += size;

    return start_iova;
 }

 static void
 pkt4(struct cmdstream *cs, uint16_t regindx, uint16_t cnt, uint32_t payload)
 {
    pkt(cs, pm4_pkt4_hdr(regindx, cnt));
    pkt(cs, payload);
 }

 static void
 pkt7(struct cmdstream *cs, uint8_t opcode, uint16_t cnt)
 {
    pkt(cs, pm4_pkt7_hdr(opcode, cnt));
 }

 struct rd_section {
    uint32_t type;
    uint32_t size;
 };

 static struct cmdstream *
 cs_alloc(struct replay_context *ctx, uint32_t size)
 {
    struct cmdstream *cs = (struct cmdstream *) calloc(1, sizeof(struct cmdstream));
    cs->mem = (uint32_t *)calloc(1, size);
    cs->total_size = size / sizeof(uint32_t);
    cs->cur = 0;
    cs->iova = util_vma_heap_alloc(&ctx->vma, size, 4096);

    assert(cs->iova != 0);

    list_addtail(&cs->link, &ctx->cs_list);

    return cs;
 }

 static void
 rd_write_gpu_addr_section(FILE *out, struct cmdstream *cs, enum rd_sect_type section)
 {
    const uint32_t packet[] = {(uint32_t)cs->iova,
                               (uint32_t)(cs->cur * sizeof(uint32_t)),
                               (uint32_t)(cs->iova >> 32)};
    struct rd_section section_address = {.type = section,
                                         .size = sizeof(packet)};
    fwrite(&section_address, sizeof(section_address), 1, out);
    fwrite(packet, sizeof(packet), 1, out);
 }

 static void
 rd_write_cs_buffer(FILE *out, struct cmdstream *cs)
 {
    if (cs->cur == 0)
       return;

    rd_write_gpu_addr_section(out, cs, RD_GPUADDR);

    struct rd_section section_contents = {.type = RD_BUFFER_CONTENTS,
                                          .size = uint32_t(cs->cur * sizeof(uint32_t))};

    fwrite(&section_contents, sizeof(section_contents), 1, out);
    fwrite(cs->mem, sizeof(uint32_t), cs->cur, out);
 }

 static void
 rd_write_cs_submit(FILE *out, struct cmdstream *cs)
 {
    const uint32_t packet[] = {(uint32_t)cs->iova, cs->cur,
                               (uint32_t)(cs->iova >> 32)};
    struct rd_section section_cmdstream = {.type = RD_CMDSTREAM_ADDR,
                                           .size = sizeof(packet)};

    fwrite(&section_cmdstream, sizeof(section_cmdstream), 1, out);
    fwrite(packet, sizeof(packet), 1, out);
 }

 static void
 rd_write_wrbuffer(FILE *out, struct wrbuf *wrbuf)
 {
    uint32_t name_len = strlen(wrbuf->name) + 1;
    struct rd_section section = {.type = RD_WRBUFFER,
                                 .size = (uint32_t)(sizeof(uint64_t) * 3) + name_len};
    fwrite(&section, sizeof(section), 1, out);
    fwrite(&wrbuf->iova, sizeof(uint64_t), 1, out);
    fwrite(&wrbuf->size, sizeof(uint64_t), 1, out);
    fwrite(&wrbuf->clear, sizeof(uint64_t), 1, out);
    fwrite(wrbuf->name, sizeof(char), name_len, out);
 }

 static void
 print_usage(const char *name)
 {
    /* clang-format off */
    fprintf(stderr, "Usage:\n\n"
            "\t%s [OPTIONS]... FILE...\n\n"
            "Options:\n"
            "\t    --vastart=offset\n"
            "\t    --vasize=size\n"
            "\t-h, --help             - show this message\n"
            , name);
    /* clang-format on */
    exit(2);
 }

 #define OPT_VA_START 1000
 #define OPT_VA_SIZE  1001

 /* clang-format off */
 static const struct option opts[] = {
       { "vastart",  required_argument, 0, OPT_VA_START },
       { "vasize",   required_argument, 0, OPT_VA_SIZE },
       { "help",     no_argument,       0, 'h' },
 };
 /* clang-format on */

 static void
 replay_context_init(struct replay_context *ctx, struct fd_dev_id *dev_id,
                     int argc, char **argv)
 {
    uint64_t va_start = 0;
    uint64_t va_size = 0;

    int c;
    while ((c = getopt_long(argc, argv, "h", opts, NULL)) != -1) {
       switch (c) {
       case OPT_VA_START:
          va_start = strtoull(optarg, NULL, 0);
          break;
       case OPT_VA_SIZE:
          va_size = strtoull(optarg, NULL, 0);
          break;
       case 'h':
       default:
          print_usage(argv[0]);
       }
    }

    if (optind < argc) {
       ctx->output_name = argv[optind];
    } else {
    }

    if (!va_start || !va_size || !ctx->output_name) {
       print_usage(argv[0]);
       exit(1);
    }

    ctx->mem_ctx = ralloc_context(NULL);
    list_inithead(&ctx->cs_list);
    list_inithead(&ctx->wrbuf_list);

    util_vma_heap_init(&ctx->vma, va_start, ROUND_DOWN_TO(va_size, 4096));

    ctx->submit_cs = cs_alloc(ctx, 1024 * 1024);
    ctx->state_cs = cs_alloc(ctx, 2 * 1024 * 1024);
    ctx->shader_cs = cs_alloc(ctx, 8 * 1024 * 1024);

    ctx->shader_log = cs_alloc(ctx, 1024 * 1024);
    ctx->shader_log->mem[0] = (ctx->shader_log->iova & 0xffffffff) + sizeof(uint64_t);
    ctx->shader_log->mem[1] = ctx->shader_log->iova >> 32;
    ctx->shader_log->cur = ctx->shader_log->total_size;

    ctx->cp_log = cs_alloc(ctx, 8 * 1024 * 1024);
    ((uint64_t *)ctx->cp_log->mem)[0] = ctx->cp_log->iova + 2 * sizeof(uint64_t);
    ((uint64_t *)ctx->cp_log->mem)[1] = sizeof(uint64_t);
    ctx->cp_log->cur = ctx->cp_log->total_size;

    struct ir3_compiler_options options{
       .disable_cache = true,
    };
    ctx->compiler =
       ir3_compiler_create(NULL, dev_id, fd_dev_info_raw(dev_id), &options);
    ctx->compiled_shaders = _mesa_hash_table_u64_create(ctx->mem_ctx);
 }

 static void
 replay_context_finish(struct replay_context *ctx)
 {
    FILE *out = fopen(ctx->output_name, "w");
    if (!out) {
       errx(1, "Cannot open '%s' for writing\n", ctx->output_name);
    }

    static const uint32_t gpu_id = 660;
    struct rd_section section_gpu_id = {.type = RD_GPU_ID,
                                        .size = 1 * sizeof(uint32_t)};
    fwrite(&section_gpu_id, sizeof(section_gpu_id), 1, out);
    fwrite(&gpu_id, sizeof(uint32_t), 1, out);

    rd_write_gpu_addr_section(out, ctx->shader_log, RD_SHADER_LOG_BUFFER);
    rd_write_gpu_addr_section(out, ctx->cp_log, RD_CP_LOG_BUFFER);

    list_for_each_entry (struct cmdstream, cs, &ctx->cs_list, link) {
       rd_write_cs_buffer(out, cs);
    }
    rd_write_cs_submit(out, ctx->submit_cs);

    list_for_each_entry (struct wrbuf, wrbuf, &ctx->wrbuf_list, link) {
       rd_write_wrbuffer(out, wrbuf);
    }

    fclose(out);
 }

 static void
 upload_shader(struct replay_context *ctx, uint64_t id, const char *source)
 {
    FILE *in = fmemopen((void *)source, strlen(source), "r");

    struct ir3_kernel_info info = {
       .shader_print_buffer_iova = ctx->shader_log->iova,
    };
    struct ir3_shader *shader = ir3_parse_asm(ctx->compiler, &info, in);
    assert(shader);

    fclose(in);

    uint64_t *shader_iova = ralloc(ctx->mem_ctx, uint64_t);
    *shader_iova = pkt_blob(ctx->shader_cs, shader->variants->bin,
                            shader->variants->info.size, 128);
    ralloc_free(shader);

    _mesa_hash_table_u64_insert(ctx->compiled_shaders, id, shader_iova);
 }

 static void
 emit_shader_iova(struct replay_context *ctx, struct cmdstream *cs, uint64_t id)
 {
    uint64_t *shader_iova = (uint64_t *)
       _mesa_hash_table_u64_search(ctx->compiled_shaders, id);
    if (shader_iova) {
       pkt_qw(cs, *shader_iova);
    } else {
       fprintf(stderr,
               "Not override for shader at 0x%" PRIx64 ", using original\n", id);
       pkt_qw(cs, id);
    }
 }

 #define begin_draw_state()                                                     \
    uint64_t subcs_iova_start = cs_get_cur_iova(ctx.state_cs);                  \
    struct cmdstream *prev_cs = cs;                                             \
    struct cmdstream *cs = ctx.state_cs;

 #define end_draw_state(params)                                                 \
    uint64_t subcs_iova_end = cs_get_cur_iova(ctx.state_cs);                    \
    uint32_t subcs_size =                                                       \
       (subcs_iova_end - subcs_iova_start) / sizeof(uint32_t);                  \
    pkt7(prev_cs, CP_SET_DRAW_STATE, 3);                                        \
    pkt(prev_cs, (params) | subcs_size);                                        \
    pkt_qw(prev_cs, subcs_iova_start);

 #define begin_ib()                                                             \
    struct cmdstream *prev_cs = cs;                                             \
    struct cmdstream *cs = cs_alloc(&ctx, 1024 * 1024);

 #define end_ib()                                                               \
    uint64_t ibcs_size = cs->cur;                                               \
    pkt7(prev_cs, CP_INDIRECT_BUFFER, 3);                                       \
    pkt_qw(prev_cs, cs->iova);                                                  \
    pkt(prev_cs, ibcs_size);

 static void
 gpu_print(struct replay_context *ctx, struct cmdstream *_cs, uint64_t iova,
           uint32_t dwords)
 {
    uint64_t header_iova, body_iova;
    struct cmdstream *prev_cs = _cs;
    struct cmdstream *cs = cs_alloc(ctx, 4096);
    /* Commands that are being modified should be in a separate cmdstream,
     * otherwise they would be prefetched and writes would not be visible.
     */
    {
       /* Write size into entry's header */
       pkt7(cs, CP_MEM_WRITE, 4);
       header_iova = cs_get_cur_iova(cs);
       pkt_qw(cs, 0xdeadbeef);
       uint64_t size_iova = cs_get_cur_iova(cs);
       pkt(cs, dwords * 4);
       pkt(cs, 0);

       /* Copy the data into entry's body */
       pkt7(cs, CP_MEMCPY, 5);
       pkt(cs, dwords);
       pkt_qw(cs, iova);
       body_iova = cs_get_cur_iova(cs);
       pkt_qw(cs, 0xdeadbeef);

       /* iova = iova + body_size + header_size */
       pkt7(cs, CP_MEM_TO_MEM, 9);
       pkt(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
       pkt_qw(cs, ctx->cp_log->iova);
       pkt_qw(cs, ctx->cp_log->iova);
       pkt_qw(cs, size_iova);
       pkt_qw(cs, ctx->cp_log->iova + sizeof(uint64_t));
    }

    {
       struct cmdstream *cs = prev_cs;
       pkt7(cs, CP_MEM_TO_MEM, 5);
       pkt(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
       pkt_qw(cs, header_iova);
       pkt_qw(cs, ctx->cp_log->iova);

       pkt7(cs, CP_MEM_TO_MEM, 7);
       pkt(cs, CP_MEM_TO_MEM_0_DOUBLE);
       pkt_qw(cs, body_iova);
       pkt_qw(cs, ctx->cp_log->iova);
       pkt_qw(cs, ctx->cp_log->iova + sizeof(uint64_t));

       pkt7(cs, CP_WAIT_MEM_WRITES, 0);
       pkt7(cs, CP_WAIT_FOR_ME, 0);
    }

    end_ib();
 }

 /* This function is used to read a buffer from the GPU into a file.
  * The buffer can optionally be cleared to 0xdeadbeef at the start
  * of the cmdstream by setting the clear parameter to true.
  *
  * Note: Unlike gpu_print, this function isn't sequenced, it will
  * read the state of the buffer at the end of the cmdstream, not
  * at the point of the call.
  */
 static void
 gpu_read_into_file(struct replay_context *ctx, struct cmdstream *_cs,
                     uint64_t iova, uint64_t size, bool clear, const char *name)
 {
    struct wrbuf *wrbuf = (struct wrbuf *) calloc(1, sizeof(struct wrbuf));
    wrbuf->iova = iova;
    wrbuf->size = size;
    wrbuf->clear = clear;
    wrbuf->name = strdup(name);

    assert(wrbuf->iova != 0);

    list_addtail(&wrbuf->link, &ctx->wrbuf_list);
 }
	/*
	* Copyright © 2022 Igalia S.L.
	* SPDX-License-Identifier: MIT
	*/

	#include <assert.h>
	#include <err.h>
	#include <getopt.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/types.h>

	#include "redump.h"

	#include "util/u_math.h"

	#include "adreno_common.xml.h"
	#include "adreno_pm4.xml.h"
	#include "freedreno_pm4.h"

	#include "a6xx.xml.h"

	#include "ir3/ir3_assembler.h"
	#include "ir3/ir3_compiler.h"
	#include "ir3/ir3_shader.h"

	#include "util/list.h"
	#include "util/vma.h"

	struct cmdstream {
	struct list_head link;

	uint32_t *mem;
	uint32_t total_size;
	uint32_t cur;

	uint64_t iova;
	};

	static uint64_t
	cs_get_cur_iova(struct cmdstream *cs)
	{
	return cs->iova + cs->cur * sizeof(uint32_t);
	}

	struct wrbuf {
	struct list_head link;

	uint64_t iova;
	uint64_t size;
	uint64_t clear;
	const char *name;
	};

	struct replay_context {
	void *mem_ctx;

	struct util_vma_heap vma;

	struct cmdstream *submit_cs;
	struct cmdstream *state_cs;
	struct cmdstream *shader_cs;

	struct cmdstream *shader_log;
	struct cmdstream *cp_log;

	struct list_head cs_list;

	struct list_head wrbuf_list;

	struct ir3_compiler *compiler;

	struct hash_table_u64 *compiled_shaders;

	const char *output_name;
	};

	static void
	pkt(struct cmdstream *cs, uint32_t payload)
	{
	assert(cs->cur <= cs->total_size);
	cs->mem[cs->cur++] = payload;
	}

	static void
	pkt_qw(struct cmdstream *cs, uint64_t payload)
	{
	pkt(cs, payload);
	pkt(cs, payload >> 32);
	}

	static uint64_t
	pkt_blob(struct cmdstream cs, void payload, uint32_t size, uint32_t alignment)
	{
	cs->cur = align(cs->cur, alignment / sizeof(uint32_t));
	uint64_t start_iova = cs_get_cur_iova(cs);

	memcpy(cs->mem + cs->cur, payload, size);
	cs->cur += size;

	return start_iova;
	}

	static void
	pkt4(struct cmdstream *cs, uint16_t regindx, uint16_t cnt, uint32_t payload)
	{
	pkt(cs, pm4_pkt4_hdr(regindx, cnt));
	pkt(cs, payload);
	}

	static void
	pkt7(struct cmdstream *cs, uint8_t opcode, uint16_t cnt)
	{
	pkt(cs, pm4_pkt7_hdr(opcode, cnt));
	}

	struct rd_section {
	uint32_t type;
	uint32_t size;
	};

	static struct cmdstream *
	cs_alloc(struct replay_context *ctx, uint32_t size)
	{
	struct cmdstream cs = (struct cmdstream ) calloc(1, sizeof(struct cmdstream));
	cs->mem = (uint32_t *)calloc(1, size);
	cs->total_size = size / sizeof(uint32_t);
	cs->cur = 0;
	cs->iova = util_vma_heap_alloc(&ctx->vma, size, 4096);

	assert(cs->iova != 0);

	list_addtail(&cs->link, &ctx->cs_list);

	return cs;
	}

	static void
	rd_write_gpu_addr_section(FILE out, struct cmdstream cs, enum rd_sect_type section)
	{
	const uint32_t packet[] = {(uint32_t)cs->iova,
	(uint32_t)(cs->cur * sizeof(uint32_t)),
	(uint32_t)(cs->iova >> 32)};
	struct rd_section section_address = {.type = section,
	.size = sizeof(packet)};
	fwrite(&section_address, sizeof(section_address), 1, out);
	fwrite(packet, sizeof(packet), 1, out);
	}

	static void
	rd_write_cs_buffer(FILE out, struct cmdstream cs)
	{
	if (cs->cur == 0)
	return;

	rd_write_gpu_addr_section(out, cs, RD_GPUADDR);

	struct rd_section section_contents = {.type = RD_BUFFER_CONTENTS,
	.size = uint32_t(cs->cur * sizeof(uint32_t))};

	fwrite(&section_contents, sizeof(section_contents), 1, out);
	fwrite(cs->mem, sizeof(uint32_t), cs->cur, out);
	}

	static void
	rd_write_cs_submit(FILE out, struct cmdstream cs)
	{
	const uint32_t packet[] = {(uint32_t)cs->iova, cs->cur,
	(uint32_t)(cs->iova >> 32)};
	struct rd_section section_cmdstream = {.type = RD_CMDSTREAM_ADDR,
	.size = sizeof(packet)};

	fwrite(&section_cmdstream, sizeof(section_cmdstream), 1, out);
	fwrite(packet, sizeof(packet), 1, out);
	}

	static void
	rd_write_wrbuffer(FILE out, struct wrbuf wrbuf)
	{
	uint32_t name_len = strlen(wrbuf->name) + 1;
	struct rd_section section = {.type = RD_WRBUFFER,
	.size = (uint32_t)(sizeof(uint64_t) * 3) + name_len};
	fwrite(&section, sizeof(section), 1, out);
	fwrite(&wrbuf->iova, sizeof(uint64_t), 1, out);
	fwrite(&wrbuf->size, sizeof(uint64_t), 1, out);
	fwrite(&wrbuf->clear, sizeof(uint64_t), 1, out);
	fwrite(wrbuf->name, sizeof(char), name_len, out);
	}

	static void
	print_usage(const char *name)
	{
	/* clang-format off */
	fprintf(stderr, "Usage:\n\n"
	"\t%s [OPTIONS]... FILE...\n\n"
	"Options:\n"
	"\t --vastart=offset\n"
	"\t --vasize=size\n"
	"\t-h, --help - show this message\n"
	, name);
	/* clang-format on */
	exit(2);
	}

	#define OPT_VA_START 1000
	#define OPT_VA_SIZE 1001

	/* clang-format off */
	static const struct option opts[] = {
	{ "vastart", required_argument, 0, OPT_VA_START },
	{ "vasize", required_argument, 0, OPT_VA_SIZE },
	{ "help", no_argument, 0, 'h' },
	};
	/* clang-format on */

	static void
	replay_context_init(struct replay_context ctx, struct fd_dev_id dev_id,
	int argc, char **argv)
	{
	uint64_t va_start = 0;
	uint64_t va_size = 0;

	int c;
	while ((c = getopt_long(argc, argv, "h", opts, NULL)) != -1) {
	switch (c) {
	case OPT_VA_START:
	va_start = strtoull(optarg, NULL, 0);
	break;
	case OPT_VA_SIZE:
	va_size = strtoull(optarg, NULL, 0);
	break;
	case 'h':
	default:
	print_usage(argv[0]);
	}
	}

	if (optind < argc) {
	ctx->output_name = argv[optind];
	} else {
	}

	if (!va_start \|\| !va_size \|\| !ctx->output_name) {
	print_usage(argv[0]);
	exit(1);
	}

	ctx->mem_ctx = ralloc_context(NULL);
	list_inithead(&ctx->cs_list);
	list_inithead(&ctx->wrbuf_list);

	util_vma_heap_init(&ctx->vma, va_start, ROUND_DOWN_TO(va_size, 4096));

	ctx->submit_cs = cs_alloc(ctx, 1024 * 1024);
	ctx->state_cs = cs_alloc(ctx, 2 * 1024 * 1024);
	ctx->shader_cs = cs_alloc(ctx, 8 * 1024 * 1024);

	ctx->shader_log = cs_alloc(ctx, 1024 * 1024);
	ctx->shader_log->mem[0] = (ctx->shader_log->iova & 0xffffffff) + sizeof(uint64_t);
	ctx->shader_log->mem[1] = ctx->shader_log->iova >> 32;
	ctx->shader_log->cur = ctx->shader_log->total_size;

	ctx->cp_log = cs_alloc(ctx, 8 * 1024 * 1024);
	((uint64_t )ctx->cp_log->mem)[0] = ctx->cp_log->iova + 2 sizeof(uint64_t);
	((uint64_t *)ctx->cp_log->mem)[1] = sizeof(uint64_t);
	ctx->cp_log->cur = ctx->cp_log->total_size;

	struct ir3_compiler_options options{
	.disable_cache = true,
	};
	ctx->compiler =
	ir3_compiler_create(NULL, dev_id, fd_dev_info_raw(dev_id), &options);
	ctx->compiled_shaders = _mesa_hash_table_u64_create(ctx->mem_ctx);
	}

	static void
	replay_context_finish(struct replay_context *ctx)
	{
	FILE *out = fopen(ctx->output_name, "w");
	if (!out) {
	errx(1, "Cannot open '%s' for writing\n", ctx->output_name);
	}

	static const uint32_t gpu_id = 660;
	struct rd_section section_gpu_id = {.type = RD_GPU_ID,
	.size = 1 * sizeof(uint32_t)};
	fwrite(&section_gpu_id, sizeof(section_gpu_id), 1, out);
	fwrite(&gpu_id, sizeof(uint32_t), 1, out);

	rd_write_gpu_addr_section(out, ctx->shader_log, RD_SHADER_LOG_BUFFER);
	rd_write_gpu_addr_section(out, ctx->cp_log, RD_CP_LOG_BUFFER);

	list_for_each_entry (struct cmdstream, cs, &ctx->cs_list, link) {
	rd_write_cs_buffer(out, cs);
	}
	rd_write_cs_submit(out, ctx->submit_cs);

	list_for_each_entry (struct wrbuf, wrbuf, &ctx->wrbuf_list, link) {
	rd_write_wrbuffer(out, wrbuf);
	}

	fclose(out);
	}

	static void
	upload_shader(struct replay_context ctx, uint64_t id, const char source)
	{
	FILE in = fmemopen((void )source, strlen(source), "r");

	struct ir3_kernel_info info = {
	.shader_print_buffer_iova = ctx->shader_log->iova,
	};
	struct ir3_shader *shader = ir3_parse_asm(ctx->compiler, &info, in);
	assert(shader);

	fclose(in);

	uint64_t *shader_iova = ralloc(ctx->mem_ctx, uint64_t);
	*shader_iova = pkt_blob(ctx->shader_cs, shader->variants->bin,
	shader->variants->info.size, 128);
	ralloc_free(shader);

	_mesa_hash_table_u64_insert(ctx->compiled_shaders, id, shader_iova);
	}

	static void
	emit_shader_iova(struct replay_context ctx, struct cmdstream cs, uint64_t id)
	{
	uint64_t shader_iova = (uint64_t )
	_mesa_hash_table_u64_search(ctx->compiled_shaders, id);
	if (shader_iova) {
	pkt_qw(cs, *shader_iova);
	} else {
	fprintf(stderr,
	"Not override for shader at 0x%" PRIx64 ", using original\n", id);
	pkt_qw(cs, id);
	}
	}

	#define begin_draw_state() \
	uint64_t subcs_iova_start = cs_get_cur_iova(ctx.state_cs); \
	struct cmdstream *prev_cs = cs; \
	struct cmdstream *cs = ctx.state_cs;

	#define end_draw_state(params) \
	uint64_t subcs_iova_end = cs_get_cur_iova(ctx.state_cs); \
	uint32_t subcs_size = \
	(subcs_iova_end - subcs_iova_start) / sizeof(uint32_t); \
	pkt7(prev_cs, CP_SET_DRAW_STATE, 3); \
	pkt(prev_cs, (params) \| subcs_size); \
	pkt_qw(prev_cs, subcs_iova_start);

	#define begin_ib() \
	struct cmdstream *prev_cs = cs; \
	struct cmdstream cs = cs_alloc(&ctx, 1024 1024);

	#define end_ib() \
	uint64_t ibcs_size = cs->cur; \
	pkt7(prev_cs, CP_INDIRECT_BUFFER, 3); \
	pkt_qw(prev_cs, cs->iova); \
	pkt(prev_cs, ibcs_size);

	static void
	gpu_print(struct replay_context ctx, struct cmdstream _cs, uint64_t iova,
	uint32_t dwords)
	{
	uint64_t header_iova, body_iova;
	struct cmdstream *prev_cs = _cs;
	struct cmdstream *cs = cs_alloc(ctx, 4096);
	/* Commands that are being modified should be in a separate cmdstream,
	* otherwise they would be prefetched and writes would not be visible.
	*/
	{
	/* Write size into entry's header */
	pkt7(cs, CP_MEM_WRITE, 4);
	header_iova = cs_get_cur_iova(cs);
	pkt_qw(cs, 0xdeadbeef);
	uint64_t size_iova = cs_get_cur_iova(cs);
	pkt(cs, dwords * 4);
	pkt(cs, 0);

	/* Copy the data into entry's body */
	pkt7(cs, CP_MEMCPY, 5);
	pkt(cs, dwords);
	pkt_qw(cs, iova);
	body_iova = cs_get_cur_iova(cs);
	pkt_qw(cs, 0xdeadbeef);

	/* iova = iova + body_size + header_size */
	pkt7(cs, CP_MEM_TO_MEM, 9);
	pkt(cs, CP_MEM_TO_MEM_0_DOUBLE \| CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
	pkt_qw(cs, ctx->cp_log->iova);
	pkt_qw(cs, ctx->cp_log->iova);
	pkt_qw(cs, size_iova);
	pkt_qw(cs, ctx->cp_log->iova + sizeof(uint64_t));
	}

	{
	struct cmdstream *cs = prev_cs;
	pkt7(cs, CP_MEM_TO_MEM, 5);
	pkt(cs, CP_MEM_TO_MEM_0_DOUBLE \| CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES);
	pkt_qw(cs, header_iova);
	pkt_qw(cs, ctx->cp_log->iova);

	pkt7(cs, CP_MEM_TO_MEM, 7);
	pkt(cs, CP_MEM_TO_MEM_0_DOUBLE);
	pkt_qw(cs, body_iova);
	pkt_qw(cs, ctx->cp_log->iova);
	pkt_qw(cs, ctx->cp_log->iova + sizeof(uint64_t));

	pkt7(cs, CP_WAIT_MEM_WRITES, 0);
	pkt7(cs, CP_WAIT_FOR_ME, 0);
	}

	end_ib();
	}

	/* This function is used to read a buffer from the GPU into a file.
	* The buffer can optionally be cleared to 0xdeadbeef at the start
	* of the cmdstream by setting the clear parameter to true.
	*
	* Note: Unlike gpu_print, this function isn't sequenced, it will
	* read the state of the buffer at the end of the cmdstream, not
	* at the point of the call.
	*/
	static void
	gpu_read_into_file(struct replay_context ctx, struct cmdstream _cs,
	uint64_t iova, uint64_t size, bool clear, const char *name)
	{
	struct wrbuf wrbuf = (struct wrbuf ) calloc(1, sizeof(struct wrbuf));
	wrbuf->iova = iova;
	wrbuf->size = size;
	wrbuf->clear = clear;
	wrbuf->name = strdup(name);

	assert(wrbuf->iova != 0);

	list_addtail(&wrbuf->link, &ctx->wrbuf_list);
	}