| /* |
| * Copyright (C) 2015-2018 Rob Clark <robclark@freedesktop.org> |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| * |
| * Authors: |
| * Rob Clark <robclark@freedesktop.org> |
| */ |
| |
| #include "ir3_context.h" |
| #include "ir3_compiler.h" |
| #include "ir3_image.h" |
| #include "ir3_nir.h" |
| #include "ir3_shader.h" |
| |
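| /* Initializes the per-variant compile context: clones the shader's NIR |
| * so the variant-specific lowering below can mutate it freely, then runs |
| * the final NIR passes that have to happen after ir3_optimize_nir(). |
| * |
| * A minimal usage sketch (the actual caller lives in the NIR->ir3 |
| * frontend): |
| * |
| * struct ir3_context *ctx = ir3_context_init(compiler, shader, so); |
| * ...translate ctx->s (NIR) into ir3... |
| * ir3_context_free(ctx); |
| */ |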
| struct ir3_context * |
| ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader, |
| struct ir3_shader_variant *so) |
| { |
| struct ir3_context *ctx = rzalloc(NULL, struct ir3_context); |
| |
| if (compiler->gen == 4) { |
| if (so->type == MESA_SHADER_VERTEX) { |
| ctx->astc_srgb = so->key.vastc_srgb; |
| memcpy(ctx->sampler_swizzles, so->key.vsampler_swizzles, sizeof(ctx->sampler_swizzles)); |
| } else if (so->type == MESA_SHADER_FRAGMENT || |
| so->type == MESA_SHADER_COMPUTE) { |
| ctx->astc_srgb = so->key.fastc_srgb; |
| memcpy(ctx->sampler_swizzles, so->key.fsampler_swizzles, sizeof(ctx->sampler_swizzles)); |
| } |
| } else if (compiler->gen == 3) { |
| if (so->type == MESA_SHADER_VERTEX) { |
| ctx->samples = so->key.vsamples; |
| } else if (so->type == MESA_SHADER_FRAGMENT) { |
| ctx->samples = so->key.fsamples; |
| } |
| } |
| |
| if (compiler->gen >= 6) { |
| ctx->funcs = &ir3_a6xx_funcs; |
| } else if (compiler->gen >= 4) { |
| ctx->funcs = &ir3_a4xx_funcs; |
| } |
| |
| ctx->compiler = compiler; |
| ctx->so = so; |
| ctx->def_ht = |
| _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); |
| ctx->block_ht = |
| _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); |
| ctx->continue_block_ht = |
| _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); |
| ctx->sel_cond_conversions = |
| _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); |
| |
| /* TODO: maybe generate some sort of bitmask of what the key |
| * lowers vs what the shader has (ie. no need to run texture |
| * clamp lowering if there are no texture sample instrs).. |
| * although that should be done further up the stack to avoid |
| * creating duplicate variants.. |
| */ |
| |
| ctx->s = nir_shader_clone(ctx, shader->nir); |
| ir3_nir_lower_variant(so, ctx->s); |
| |
| /* this pass needs to run after the passes in ir3_optimize_nir(), so |
| * do it here instead: |
| */ |
| bool progress = false; |
| NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs); |
| |
| /* we may need cleanup after nir_lower_locals_to_regs: */ |
| while (progress) { |
| progress = false; |
| NIR_PASS(progress, ctx->s, nir_opt_algebraic); |
| NIR_PASS(progress, ctx->s, nir_opt_constant_folding); |
| } |
| |
| /* We want to lower nir_op_imul as late as possible, to also catch |
| * those generated by earlier passes (e.g. nir_lower_locals_to_regs). |
| * However, we want a final swing of a few passes to have a chance |
| * at optimizing the result. |
| */ |
| progress = false; |
| NIR_PASS(progress, ctx->s, ir3_nir_lower_imul); |
| while (progress) { |
| progress = false; |
| NIR_PASS(progress, ctx->s, nir_opt_algebraic); |
| NIR_PASS(progress, ctx->s, nir_opt_copy_prop_vars); |
| NIR_PASS(progress, ctx->s, nir_opt_dead_write_vars); |
| NIR_PASS(progress, ctx->s, nir_opt_dce); |
| NIR_PASS(progress, ctx->s, nir_opt_constant_folding); |
| } |
| |
| /* The texture pre-fetch feature exists from a4xx onwards, but only |
| * enable it on generations where it has been tested: |
| */ |
| if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gen >= 6)) |
| NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch); |
| |
| NIR_PASS(progress, ctx->s, nir_lower_phis_to_scalar, true); |
| |
| /* Super crude heuristic to limit # of tex prefetch in small |
| * shaders. This completely ignores loops.. but that's really |
| * not the worst of its problems. (A frag shader that has |
| * loops is probably going to be big enough to not trigger a |
| * lower threshold.) |
| * |
| * 1) probably want to do this in terms of ir3 instructions |
| * 2) probably really want to decide this after scheduling |
| * (or at least pre-RA sched) so we have a rough idea about |
| * nops, and don't count things that get cp'd away |
| * 3) blob seems to use higher thresholds with a mix of more |
| * SFU instructions. Which partly makes sense: more SFU |
| * instructions probably means you want to get the real |
| * shader started sooner, but that depends on where in the |
| * shader the SFU instructions are, which the blob doesn't |
| * seem to take into account. |
| * |
| * This uses more conservative thresholds, assuming a more ALU- |
| * than SFU-heavy instruction mix. |
| */ |
| if (so->type == MESA_SHADER_FRAGMENT) { |
| nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s); |
| |
| unsigned instruction_count = 0; |
| nir_foreach_block (block, fxn) { |
| instruction_count += exec_list_length(&block->instr_list); |
| } |
| |
| if (instruction_count < 50) { |
| ctx->prefetch_limit = 2; |
| } else if (instruction_count < 70) { |
| ctx->prefetch_limit = 3; |
| } else { |
| ctx->prefetch_limit = IR3_MAX_SAMPLER_PREFETCH; |
| } |
| } |
| |
| if (shader_debug_enabled(so->type, ctx->s->info.internal)) { |
| mesa_logi("NIR (final form) for %s shader %s:", ir3_shader_stage(so), |
| so->name); |
| nir_log_shaderi(ctx->s); |
| } |
| |
| ir3_ibo_mapping_init(&so->image_mapping, ctx->s->info.num_textures); |
| |
| return ctx; |
| } |
| |
| void |
| ir3_context_free(struct ir3_context *ctx) |
| { |
| ralloc_free(ctx); |
| } |
| |
| /* |
| * Misc helpers |
| */ |
| |
| /* allocate an n-element value array (to be populated by the caller) |
| * and insert it in def_ht |
| */ |
| struct ir3_instruction ** |
| ir3_get_dst_ssa(struct ir3_context *ctx, nir_ssa_def *dst, unsigned n) |
| { |
| struct ir3_instruction **value = |
| ralloc_array(ctx->def_ht, struct ir3_instruction *, n); |
| _mesa_hash_table_insert(ctx->def_ht, dst, value); |
| return value; |
| } |
| |
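| /* Like ir3_get_dst_ssa(), but also handles the non-ssa (nir_register) |
| * case. Each call must be paired with a later ir3_put_dst(), which |
| * finalizes the values (see below): |
| */ |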
| struct ir3_instruction ** |
| ir3_get_dst(struct ir3_context *ctx, nir_dest *dst, unsigned n) |
| { |
| struct ir3_instruction **value; |
| |
| if (dst->is_ssa) { |
| value = ir3_get_dst_ssa(ctx, &dst->ssa, n); |
| } else { |
| value = ralloc_array(ctx, struct ir3_instruction *, n); |
| } |
| |
| /* NOTE: in the non-ssa case we don't really need to store last_dst, |
| * but this helps us catch cases where a put_dst() call is forgotten |
| */ |
| compile_assert(ctx, !ctx->last_dst); |
| ctx->last_dst = value; |
| ctx->last_dst_n = n; |
| |
| return value; |
| } |
| |
| struct ir3_instruction *const * |
| ir3_get_src(struct ir3_context *ctx, nir_src *src) |
| { |
| if (src->is_ssa) { |
| struct hash_entry *entry; |
| entry = _mesa_hash_table_search(ctx->def_ht, src->ssa); |
| compile_assert(ctx, entry); |
| return entry->data; |
| } else { |
| nir_register *reg = src->reg.reg; |
| struct ir3_array *arr = ir3_get_array(ctx, reg); |
| unsigned num_components = arr->r->num_components; |
| struct ir3_instruction *addr = NULL; |
| struct ir3_instruction **value = |
| ralloc_array(ctx, struct ir3_instruction *, num_components); |
| |
| if (src->reg.indirect) |
| addr = ir3_get_addr0(ctx, ir3_get_src(ctx, src->reg.indirect)[0], |
| reg->num_components); |
| |
| for (unsigned i = 0; i < num_components; i++) { |
| unsigned n = src->reg.base_offset * reg->num_components + i; |
| compile_assert(ctx, n < arr->length); |
| value[i] = ir3_create_array_load(ctx, arr, n, addr); |
| } |
| |
| return value; |
| } |
| } |
| |
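| /* Finalizes the values registered by the last ir3_get_dst(): moves |
| * shared-reg results into normal regs, fixes up half-precision types, |
| * and (in the non-ssa case) stores the results back to the backing |
| * ir3_array: |
| */ |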
| void |
| ir3_put_dst(struct ir3_context *ctx, nir_dest *dst) |
| { |
| unsigned bit_size = ir3_bitsize(ctx, nir_dest_bit_size(*dst)); |
| |
| /* add an extra mov if the dst value is a shared reg.. some |
| * instructions cannot read from shared regs; in cases where they |
| * can, ir3_cp will clean up the extra mov: |
| */ |
| for (unsigned i = 0; i < ctx->last_dst_n; i++) { |
| if (!ctx->last_dst[i]) |
| continue; |
| if (ctx->last_dst[i]->dsts[0]->flags & IR3_REG_SHARED) { |
| ctx->last_dst[i] = ir3_MOV(ctx->block, ctx->last_dst[i], TYPE_U32); |
| } |
| } |
| |
| if (bit_size <= 16) { |
| for (unsigned i = 0; i < ctx->last_dst_n; i++) { |
| struct ir3_instruction *dst = ctx->last_dst[i]; |
| ir3_set_dst_type(dst, true); |
| ir3_fixup_src_type(dst); |
| if (dst->opc == OPC_META_SPLIT) { |
| ir3_set_dst_type(ssa(dst->srcs[0]), true); |
| ir3_fixup_src_type(ssa(dst->srcs[0])); |
| dst->srcs[0]->flags |= IR3_REG_HALF; |
| } |
| } |
| } |
| |
| if (!dst->is_ssa) { |
| nir_register *reg = dst->reg.reg; |
| struct ir3_array *arr = ir3_get_array(ctx, reg); |
| unsigned num_components = ctx->last_dst_n; |
| struct ir3_instruction *addr = NULL; |
| |
| if (dst->reg.indirect) |
| addr = ir3_get_addr0(ctx, ir3_get_src(ctx, dst->reg.indirect)[0], |
| reg->num_components); |
| |
| for (unsigned i = 0; i < num_components; i++) { |
| unsigned n = dst->reg.base_offset * reg->num_components + i; |
| compile_assert(ctx, n < arr->length); |
| if (!ctx->last_dst[i]) |
| continue; |
| ir3_create_array_store(ctx, arr, n, ctx->last_dst[i], addr); |
| } |
| |
| ralloc_free(ctx->last_dst); |
| } |
| |
| ctx->last_dst = NULL; |
| ctx->last_dst_n = 0; |
| } |
| |
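| /* Returns the dst flags (half/shared) that the sources and dests of |
| * collect/split meta instructions need to agree on: |
| */ |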
| static unsigned |
| dest_flags(struct ir3_instruction *instr) |
| { |
| return instr->dsts[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED); |
| } |
| |
| struct ir3_instruction * |
| ir3_create_collect(struct ir3_block *block, struct ir3_instruction *const *arr, |
| unsigned arrsz) |
| { |
| struct ir3_instruction *collect; |
| |
| if (arrsz == 0) |
| return NULL; |
| |
| unsigned flags = dest_flags(arr[0]); |
| |
| collect = ir3_instr_create(block, OPC_META_COLLECT, 1, arrsz); |
| __ssa_dst(collect)->flags |= flags; |
| for (unsigned i = 0; i < arrsz; i++) { |
| struct ir3_instruction *elem = arr[i]; |
| |
| /* Since arrays are pre-colored in RA, we can't assume that |
| * things will end up in the right place. (Ie. if a collect |
| * joins elements from two different arrays.) So insert an |
| * extra mov. |
| * |
| * We could possibly skip this if all the collected elements |
| * are contiguous elements in a single array.. not sure how |
| * likely that is to happen. |
| * |
| * Fixes a problem with glamor shaders, which in effect do |
| * something like: |
| * |
| * if (foo) |
| * texcoord = .. |
| * else |
| * texcoord = .. |
| * color = texture2D(tex, texcoord); |
| * |
| * In this case, texcoord will end up as nir registers (which |
| * translate to ir3 arrays of length 1), and we can't assume |
| * the two (or more) arrays will get allocated in consecutive |
| * scalar registers. |
| */ |
| if (elem->dsts[0]->flags & IR3_REG_ARRAY) { |
| type_t type = (flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; |
| elem = ir3_MOV(block, elem, type); |
| } |
| |
| assert(dest_flags(elem) == flags); |
| __ssa_src(collect, elem, flags); |
| } |
| |
| collect->dsts[0]->wrmask = MASK(arrsz); |
| |
| return collect; |
| } |
| |
| /* helper for instructions that produce multiple consecutive scalar |
| * outputs, which need to have a split meta instruction inserted: |
| */ |
| void |
| ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst, |
| struct ir3_instruction *src, unsigned base, unsigned n) |
| { |
| if ((n == 1) && (src->dsts[0]->wrmask == 0x1) && |
| /* setup_input needs ir3_split_dest to generate a SPLIT instruction */ |
| src->opc != OPC_META_INPUT) { |
| dst[0] = src; |
| return; |
| } |
| |
| if (src->opc == OPC_META_COLLECT) { |
| assert((base + n) <= src->srcs_count); |
| |
| for (int i = 0; i < n; i++) { |
| dst[i] = ssa(src->srcs[i + base]); |
| } |
| |
| return; |
| } |
| |
| unsigned flags = dest_flags(src); |
| |
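| /* Create one split per component; splits for channels not covered by |
| * the wrmask are simply never recorded in dst[] and get dropped as |
| * dead code: |
| */ |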
| for (int i = 0, j = 0; i < n; i++) { |
| struct ir3_instruction *split = |
| ir3_instr_create(block, OPC_META_SPLIT, 1, 1); |
| __ssa_dst(split)->flags |= flags; |
| __ssa_src(split, src, flags); |
| split->split.off = i + base; |
| |
| if (src->dsts[0]->wrmask & (1 << (i + base))) |
| dst[j++] = split; |
| } |
| } |
| |
| NORETURN void |
| ir3_context_error(struct ir3_context *ctx, const char *format, ...) |
| { |
| struct hash_table *errors = NULL; |
| va_list ap; |
| va_start(ap, format); |
| if (ctx->cur_instr) { |
| errors = _mesa_hash_table_create(NULL, _mesa_hash_pointer, |
| _mesa_key_pointer_equal); |
| char *msg = ralloc_vasprintf(errors, format, ap); |
| _mesa_hash_table_insert(errors, ctx->cur_instr, msg); |
| } else { |
| mesa_loge_v(format, ap); |
| } |
| va_end(ap); |
| nir_log_shader_annotated(ctx->s, errors); |
| ralloc_free(errors); |
| ctx->error = true; |
| unreachable(""); |
| } |
| |
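| /* Builds the sequence to get an indirect offset into a0.x: convert |
| * the src to s16, scale it by the per-element stride (align), and |
| * mova the result into a0.x: |
| */ |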
| static struct ir3_instruction * |
| create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align) |
| { |
| struct ir3_instruction *instr, *immed; |
| |
| instr = ir3_COV(block, src, TYPE_U32, TYPE_S16); |
| |
| switch (align) { |
| case 1: |
| /* src *= 1: */ |
| break; |
| case 2: |
| /* src *= 2 => src <<= 1: */ |
| immed = create_immed_typed(block, 1, TYPE_S16); |
| instr = ir3_SHL_B(block, instr, 0, immed, 0); |
| break; |
| case 3: |
| /* src *= 3: */ |
| immed = create_immed_typed(block, 3, TYPE_S16); |
| instr = ir3_MULL_U(block, instr, 0, immed, 0); |
| break; |
| case 4: |
| /* src *= 4 => src <<= 2: */ |
| immed = create_immed_typed(block, 2, TYPE_S16); |
| instr = ir3_SHL_B(block, instr, 0, immed, 0); |
| break; |
| default: |
| unreachable("bad align"); |
| return NULL; |
| } |
| |
| instr->dsts[0]->flags |= IR3_REG_HALF; |
| |
| instr = ir3_MOV(block, instr, TYPE_S16); |
| instr->dsts[0]->num = regid(REG_A0, 0); |
| |
| return instr; |
| } |
| |
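| /* Moves an immediate offset into the second address register |
| * component, a1.x: |
| */ |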
| static struct ir3_instruction * |
| create_addr1(struct ir3_block *block, unsigned const_val) |
| { |
| struct ir3_instruction *immed = |
| create_immed_typed(block, const_val, TYPE_U16); |
| struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_U16); |
| instr->dsts[0]->num = regid(REG_A0, 1); |
| return instr; |
| } |
| |
| /* caches addr values to avoid generating multiple cov/shl/mova |
| * sequences for each use of a given NIR-level src as an address |
| */ |
| struct ir3_instruction * |
| ir3_get_addr0(struct ir3_context *ctx, struct ir3_instruction *src, int align) |
| { |
| struct ir3_instruction *addr; |
| unsigned idx = align - 1; |
| |
| compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr0_ht)); |
| |
| if (!ctx->addr0_ht[idx]) { |
| ctx->addr0_ht[idx] = _mesa_hash_table_create(ctx, _mesa_hash_pointer, |
| _mesa_key_pointer_equal); |
| } else { |
| struct hash_entry *entry; |
| entry = _mesa_hash_table_search(ctx->addr0_ht[idx], src); |
| if (entry) |
| return entry->data; |
| } |
| |
| addr = create_addr0(ctx->block, src, align); |
| _mesa_hash_table_insert(ctx->addr0_ht[idx], src, addr); |
| |
| return addr; |
| } |
| |
| /* Similar to ir3_get_addr0, but for a1.x. */ |
| struct ir3_instruction * |
| ir3_get_addr1(struct ir3_context *ctx, unsigned const_val) |
| { |
| struct ir3_instruction *addr; |
| |
| if (!ctx->addr1_ht) { |
| ctx->addr1_ht = _mesa_hash_table_u64_create(ctx); |
| } else { |
| addr = _mesa_hash_table_u64_search(ctx->addr1_ht, const_val); |
| if (addr) |
| return addr; |
| } |
| |
| addr = create_addr1(ctx->block, const_val); |
| _mesa_hash_table_u64_insert(ctx->addr1_ht, const_val, addr); |
| |
| return addr; |
| } |
| |
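| /* Converts a boolean-valued src into the p0.x predicate register by |
| * comparing it against zero (cmps.s.ne): |
| */ |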
| struct ir3_instruction * |
| ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src) |
| { |
| struct ir3_block *b = ctx->block; |
| struct ir3_instruction *cond; |
| |
| /* NOTE: only cmps.*.* can write p0.x: */ |
| struct ir3_instruction *zero = |
| create_immed_typed(b, 0, is_half(src) ? TYPE_U16 : TYPE_U32); |
| cond = ir3_CMPS_S(b, src, 0, zero, 0); |
| cond->cat2.condition = IR3_COND_NE; |
| |
| /* condition always goes in predicate register: */ |
| cond->dsts[0]->num = regid(REG_P0, 0); |
| cond->dsts[0]->flags &= ~IR3_REG_SSA; |
| |
| return cond; |
| } |
| |
| /* |
| * Array helpers |
| */ |
| |
| void |
| ir3_declare_array(struct ir3_context *ctx, nir_register *reg) |
| { |
| struct ir3_array *arr = rzalloc(ctx, struct ir3_array); |
| arr->id = ++ctx->num_arrays; |
| /* NOTE: sometimes we get non-array regs, for example for arrays of |
| * length 1. See fs-const-array-of-struct-of-array.shader_test. So |
| * treat a non-array as if it were an array of length 1. |
| * |
| * It would be nice if there was a nir pass to convert arrays of |
| * length 1 to ssa. |
| */ |
| arr->length = reg->num_components * MAX2(1, reg->num_array_elems); |
| compile_assert(ctx, arr->length > 0); |
| arr->r = reg; |
| arr->half = ir3_bitsize(ctx, reg->bit_size) <= 16; |
| list_addtail(&arr->node, &ctx->ir->array_list); |
| } |
| |
| struct ir3_array * |
| ir3_get_array(struct ir3_context *ctx, nir_register *reg) |
| { |
| foreach_array (arr, &ctx->ir->array_list) { |
| if (arr->r == reg) |
| return arr; |
| } |
| ir3_context_error(ctx, "bogus reg: r%d\n", reg->index); |
| return NULL; |
| } |
| |
| /* relative (indirect) if address!=NULL */ |
| struct ir3_instruction * |
| ir3_create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n, |
| struct ir3_instruction *address) |
| { |
| struct ir3_block *block = ctx->block; |
| struct ir3_instruction *mov; |
| struct ir3_register *src; |
| unsigned flags = 0; |
| |
| mov = ir3_instr_create(block, OPC_MOV, 1, 1); |
| if (arr->half) { |
| mov->cat1.src_type = TYPE_U16; |
| mov->cat1.dst_type = TYPE_U16; |
| flags |= IR3_REG_HALF; |
| } else { |
| mov->cat1.src_type = TYPE_U32; |
| mov->cat1.dst_type = TYPE_U32; |
| } |
| |
| mov->barrier_class = IR3_BARRIER_ARRAY_R; |
| mov->barrier_conflict = IR3_BARRIER_ARRAY_W; |
| __ssa_dst(mov)->flags |= flags; |
| src = ir3_src_create(mov, 0, |
| IR3_REG_ARRAY | COND(address, IR3_REG_RELATIV) | flags); |
| src->def = (arr->last_write && arr->last_write->instr->block == block) |
| ? arr->last_write |
| : NULL; |
| src->size = arr->length; |
| src->array.id = arr->id; |
| src->array.offset = n; |
| src->array.base = INVALID_REG; |
| |
| if (address) |
| ir3_instr_set_address(mov, address); |
| |
| return mov; |
| } |
| |
| /* relative (indirect) if address!=NULL */ |
| void |
| ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n, |
| struct ir3_instruction *src, |
| struct ir3_instruction *address) |
| { |
| struct ir3_block *block = ctx->block; |
| struct ir3_instruction *mov; |
| struct ir3_register *dst; |
| unsigned flags = 0; |
| |
| /* if this is not a relative store, don't create an extra mov, since |
| * that ends up being difficult for cp to remove. |
| * |
| * Also, don't skip the mov if the src is meta (like fanout/split), |
| * since that creates a situation that RA can't really handle |
| * properly. |
| */ |
| if (!address && !is_meta(src)) { |
| dst = src->dsts[0]; |
| |
| src->barrier_class |= IR3_BARRIER_ARRAY_W; |
| src->barrier_conflict |= IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W; |
| |
| dst->flags |= IR3_REG_ARRAY; |
| dst->size = arr->length; |
| dst->array.id = arr->id; |
| dst->array.offset = n; |
| dst->array.base = INVALID_REG; |
| |
| if (arr->last_write && arr->last_write->instr->block == src->block) |
| ir3_reg_set_last_array(src, dst, arr->last_write); |
| |
| arr->last_write = dst; |
| |
| array_insert(block, block->keeps, src); |
| |
| return; |
| } |
| |
| mov = ir3_instr_create(block, OPC_MOV, 1, 1); |
| if (arr->half) { |
| mov->cat1.src_type = TYPE_U16; |
| mov->cat1.dst_type = TYPE_U16; |
| flags |= IR3_REG_HALF; |
| } else { |
| mov->cat1.src_type = TYPE_U32; |
| mov->cat1.dst_type = TYPE_U32; |
| } |
| mov->barrier_class = IR3_BARRIER_ARRAY_W; |
| mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W; |
| dst = ir3_dst_create( |
| mov, 0, |
| IR3_REG_SSA | IR3_REG_ARRAY | flags | COND(address, IR3_REG_RELATIV)); |
| dst->instr = mov; |
| dst->size = arr->length; |
| dst->array.id = arr->id; |
| dst->array.offset = n; |
| dst->array.base = INVALID_REG; |
| ir3_src_create(mov, 0, IR3_REG_SSA | flags)->def = src->dsts[0]; |
| |
| if (arr->last_write && arr->last_write->instr->block == block) |
| ir3_reg_set_last_array(mov, dst, arr->last_write); |
| |
| if (address) |
| ir3_instr_set_address(mov, address); |
| |
| arr->last_write = dst; |
| |
| /* the array store may only matter to something in an earlier |
| * block (ie. loops), but since arrays are not in SSA, the depth |
| * pass won't know this.. so keep all array stores: |
| */ |
| array_insert(block, block->keeps, mov); |
| } |