| /* |
| * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org> |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| * |
| * Authors: |
| * Rob Clark <robclark@freedesktop.org> |
| */ |
| |
| #include "util/ralloc.h" |
| |
| #include "freedreno_dev_info.h" |
| |
| #include "ir3_compiler.h" |
| |
/* Flags accepted in the IR3_SHADER_DEBUG environment variable (parsed once
 * via DEBUG_GET_ONCE_FLAGS_OPTION below).  Each token maps to one
 * IR3_DBG_* bit in enum ir3_shader_debug.
 */
static const struct debug_named_value shader_debug_options[] = {
   /* clang-format off */
   {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"},
   {"tcs", IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
   {"tes", IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
   {"gs", IR3_DBG_SHADER_GS, "Print shader disasm for geometry shaders"},
   {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"},
   {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"},
   {"internal", IR3_DBG_SHADER_INTERNAL, "Print shader disasm for internal shaders (normally not included in vs/fs/cs/etc)"},
   {"disasm", IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"},
   {"optmsgs", IR3_DBG_OPTMSGS, "Enable optimizer debug messages"},
   {"forces2en", IR3_DBG_FORCES2EN, "Force s2en mode for tex sampler instructions"},
   {"nouboopt", IR3_DBG_NOUBOOPT, "Disable lowering UBO to uniform"},
   {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"},
   {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"},
   {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"},
   {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
#ifdef DEBUG
   /* DEBUG-only options: */
   {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"},
   {"ramsgs", IR3_DBG_RAMSGS, "Enable register-allocation debug messages"},
#endif
   DEBUG_NAMED_VALUE_END
   /* clang-format on */
};
| |
/* One-shot getters for the debug environment variables; evaluated the first
 * time ir3_compiler_create() runs and cached thereafter.
 */
DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
                            shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
                      NULL)

/* Global debug state, populated from the env vars in ir3_compiler_create().
 * NOTE(review): shared across all compiler instances in the process.
 */
enum ir3_shader_debug ir3_shader_debug = 0;
const char *ir3_shader_override_path = NULL;
| |
| void |
| ir3_compiler_destroy(struct ir3_compiler *compiler) |
| { |
| disk_cache_destroy(compiler->disk_cache); |
| ralloc_free(compiler); |
| } |
| |
/* Baseline NIR compiler options shared by every generation.  A copy is made
 * into each ir3_compiler in ir3_compiler_create() and then adjusted with
 * per-gen overrides, so device-independent defaults belong here.
 */
static const nir_shader_compiler_options ir3_base_options = {
   /* ALU lowerings for ops the HW lacks: */
   .lower_fpow = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_ffract = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_isign = true,
   .lower_ldexp = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_high = true,
   .lower_mul_2x32_64 = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   /* overridden to true for gen3..gen5 in ir3_compiler_create(): */
   .vertex_id_zero_based = false,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_helper_invocation = true,
   .lower_bitfield_insert_to_shifts = true,
   .lower_bitfield_extract_to_shifts = true,
   /* pack/unpack conversions are done with shifts/conversions: */
   .lower_pack_half_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_pack_split = true,
   .use_interpolated_input_intrinsics = true,
   .lower_rotate = true,
   .lower_to_scalar = true,
   .has_imul24 = true,
   .has_fsub = true,
   .has_isub = true,
   .force_indirect_unrolling_sampler = true,
   .lower_uniforms_to_ubo = true,
   .use_scoped_barrier = true,
   .max_unroll_iterations = 32,

   .lower_cs_local_index_to_id = true,
   .lower_wpos_pntc = true,

   /* no native 64b int/double support on any gen -- lower everything: */
   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_doubles_options = (nir_lower_doubles_options)~0,
};
| |
| struct ir3_compiler * |
| ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, |
| const struct ir3_compiler_options *options) |
| { |
| struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler); |
| |
| ir3_shader_debug = debug_get_option_ir3_shader_debug(); |
| ir3_shader_override_path = |
| !__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL; |
| |
| if (ir3_shader_override_path) { |
| ir3_shader_debug |= IR3_DBG_NOCACHE; |
| } |
| |
| compiler->dev = dev; |
| compiler->dev_id = dev_id; |
| compiler->gen = fd_dev_gen(dev_id); |
| compiler->robust_buffer_access2 = options->robust_buffer_access2; |
| |
| /* All known GPU's have 32k local memory (aka shared) */ |
| compiler->local_mem_size = 32 * 1024; |
| /* TODO see if older GPU's were different here */ |
| compiler->branchstack_size = 64; |
| compiler->wave_granularity = 2; |
| compiler->max_waves = 16; |
| |
| compiler->max_variable_workgroup_size = 1024; |
| |
| const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id); |
| |
| if (compiler->gen >= 6) { |
| compiler->samgq_workaround = true; |
| /* a6xx split the pipeline state into geometry and fragment state, in |
| * order to let the VS run ahead of the FS. As a result there are now |
| * separate const files for the the fragment shader and everything |
| * else, and separate limits. There seems to be a shared limit, but |
| * it's higher than the vert or frag limits. |
| * |
| * Also, according to the observation on a630/a650/a660, max_const_pipeline |
| * has to be 512 when all geometry stages are present. Otherwise a gpu hang |
| * happens. Accordingly maximum safe size for each stage should be under |
| * (max_const_pipeline / 5 (stages)) with 4 vec4's alignment considered for |
| * const files. |
| * |
| * Only when VS and FS stages are present, the limit is 640. |
| * |
| * TODO: The shared limit seems to be different on different models. |
| */ |
| compiler->max_const_pipeline = 512; |
| compiler->max_const_frag = 512; |
| compiler->max_const_geom = 512; |
| compiler->max_const_safe = 100; |
| |
| /* Compute shaders don't share a const file with the FS. Instead they |
| * have their own file, which is smaller than the FS one. |
| * |
| * TODO: is this true on earlier gen's? |
| */ |
| compiler->max_const_compute = 256; |
| |
| /* TODO: implement clip+cull distances on earlier gen's */ |
| compiler->has_clip_cull = true; |
| |
| /* TODO: implement private memory on earlier gen's */ |
| compiler->has_pvtmem = true; |
| |
| compiler->has_preamble = true; |
| |
| compiler->tess_use_shared = dev_info->a6xx.tess_use_shared; |
| |
| compiler->storage_16bit = dev_info->a6xx.storage_16bit; |
| |
| compiler->has_getfiberid = dev_info->a6xx.has_getfiberid; |
| |
| compiler->has_dp2acc = dev_info->a6xx.has_dp2acc; |
| compiler->has_dp4acc = dev_info->a6xx.has_dp4acc; |
| |
| compiler->shared_consts_base_offset = 504; |
| compiler->shared_consts_size = 8; |
| compiler->geom_shared_consts_size_quirk = 16; |
| } else { |
| compiler->max_const_pipeline = 512; |
| compiler->max_const_geom = 512; |
| compiler->max_const_frag = 512; |
| compiler->max_const_compute = 512; |
| |
| /* Note: this will have to change if/when we support tess+GS on |
| * earlier gen's. |
| */ |
| compiler->max_const_safe = 256; |
| } |
| |
| if (compiler->gen >= 6) { |
| compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4; |
| } else if (compiler->gen >= 4) { |
| /* On a4xx-a5xx, using r24.x and above requires using the smallest |
| * threadsize. |
| */ |
| compiler->reg_size_vec4 = 48; |
| } else { |
| /* TODO: confirm this */ |
| compiler->reg_size_vec4 = 96; |
| } |
| |
| if (compiler->gen >= 6) { |
| compiler->threadsize_base = 64; |
| } else if (compiler->gen >= 4) { |
| /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan |
| * 1.1 subgroupSize which is 32. |
| */ |
| compiler->threadsize_base = 32; |
| } else { |
| compiler->threadsize_base = 8; |
| } |
| |
| if (compiler->gen >= 4) { |
| /* need special handling for "flat" */ |
| compiler->flat_bypass = true; |
| compiler->levels_add_one = false; |
| compiler->unminify_coords = false; |
| compiler->txf_ms_with_isaml = false; |
| compiler->array_index_add_half = true; |
| compiler->instr_align = 16; |
| compiler->const_upload_unit = 4; |
| } else { |
| /* no special handling for "flat" */ |
| compiler->flat_bypass = false; |
| compiler->levels_add_one = true; |
| compiler->unminify_coords = true; |
| compiler->txf_ms_with_isaml = true; |
| compiler->array_index_add_half = false; |
| compiler->instr_align = 4; |
| compiler->const_upload_unit = 8; |
| } |
| |
| compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32; |
| compiler->has_shared_regfile = compiler->gen >= 5; |
| |
| compiler->push_ubo_with_preamble = options->push_ubo_with_preamble; |
| |
| /* The driver can't request this unless preambles are supported. */ |
| if (options->push_ubo_with_preamble) |
| assert(compiler->has_preamble); |
| |
| /* Set up nir shader compiler options, using device-specific overrides of our base settings. */ |
| compiler->nir_options = ir3_base_options; |
| |
| if (compiler->gen >= 6) { |
| compiler->nir_options.vectorize_io = true, |
| compiler->nir_options.force_indirect_unrolling = nir_var_all, |
| |
| compiler->nir_options.lower_device_index_to_zero = true, |
| compiler->nir_options.has_udot_4x8 = true, |
| compiler->nir_options.has_sudot_4x8 = true, |
| compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc; |
| compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc; |
| } else if (compiler->gen >= 3 && compiler->gen <= 5) { |
| compiler->nir_options.vertex_id_zero_based = true; |
| } else if (compiler->gen <= 2) { |
| /* a2xx compiler doesn't handle indirect: */ |
| compiler->nir_options.force_indirect_unrolling = nir_var_all; |
| } |
| |
| /* 16-bit ALU op generation is mostly controlled by frontend compiler options, but |
| * this core NIR option enables some optimizations of 16-bit operations. |
| */ |
| if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16)) |
| compiler->nir_options.support_16bit_alu = true; |
| |
| if (!options->disable_cache) |
| ir3_disk_cache_init(compiler); |
| |
| return compiler; |
| } |
| |
| const nir_shader_compiler_options * |
| ir3_get_compiler_options(struct ir3_compiler *compiler) |
| { |
| return &compiler->nir_options; |
| } |