/*
* Copyright (c) 2012-2015 Etnaviv Project
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sub license,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* Authors:
* Wladimir J. van der Laan <laanwj@gmail.com>
*/
/* TGSI->Vivante shader ISA conversion */
/* What does the compiler return (see etna_shader_object)?
* 1) instruction data
* 2) input-to-temporary mapping (fixed for ps)
* *) in case of ps, semantic -> varying id mapping
* *) for each varying: number of components used (r, rg, rgb, rgba)
* 3) temporary-to-output mapping (in case of vs, fixed for ps)
* 4) for each input/output: possible semantic (position, color, glpointcoord, ...)
* 5) immediates base offset, immediates data
* 6) used texture units (and possibly the TGSI_TEXTURE_* type); not needed to
* configure the hw, but useful for error checking
* 7) enough information to add the z=(z+w)/2.0 necessary for older chips
* (output reg id is enough)
*
* Empty shaders are not allowed; always generate at least a NOP. Also,
* if there is a label at the end of the shader, an extra NOP should be
* generated as a jump target.
*
* TODO
* * Use an instruction scheduler
* * Indirect access to uniforms / temporaries using amode
*/
#include "etnaviv_compiler.h"
#include "etnaviv_asm.h"
#include "etnaviv_context.h"
#include "etnaviv_debug.h"
#include "etnaviv_disasm.h"
#include "etnaviv_uniforms.h"
#include "etnaviv_util.h"
#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_iterate.h"
#include "tgsi/tgsi_lowering.h"
#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_util.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#define ETNA_MAX_INNER_TEMPS 2
static const float sincos_const[2][4] = {
{
2., -1., 4., -4.,
},
{
1. / (2. * M_PI), 0.75, 0.5, 0.0,
},
};
/* Native register description structure */
struct etna_native_reg {
unsigned valid : 1;
unsigned is_tex : 1; /* is texture unit, overrides rgroup */
unsigned rgroup : 3;
unsigned id : 9;
};
/* Register description */
struct etna_reg_desc {
enum tgsi_file_type file; /* IN, OUT, TEMP, ... */
int idx; /* index into file */
bool active; /* used in program */
int first_use; /* instruction id of first use (scope begin) */
int last_use; /* instruction id of last use (scope end, inclusive) */
struct etna_native_reg native; /* native register to map to */
unsigned usage_mask : 4; /* usage, per channel */
bool has_semantic; /* register has associated TGSI semantic */
struct tgsi_declaration_semantic semantic; /* TGSI semantic */
struct tgsi_declaration_interp interp; /* Interpolation type */
};
/* Label information structure */
struct etna_compile_label {
int inst_idx; /* Instruction id that label points to */
};
enum etna_compile_frame_type {
ETNA_COMPILE_FRAME_IF, /* IF/ELSE/ENDIF */
ETNA_COMPILE_FRAME_LOOP,
};
/* nesting scope frame (LOOP, IF, ...) during compilation */
struct etna_compile_frame {
enum etna_compile_frame_type type;
int lbl_else_idx;
int lbl_endif_idx;
int lbl_loop_bgn_idx;
int lbl_loop_end_idx;
};
struct etna_compile_file {
/* Number of registers in each TGSI file (max register+1) */
size_t reg_size;
/* Register descriptions, per register index */
struct etna_reg_desc *reg;
};
#define array_insert(arr, val) \
do { \
if (arr##_count == arr##_sz) { \
arr##_sz = MAX2(2 * arr##_sz, 16); \
arr = realloc(arr, arr##_sz * sizeof(arr[0])); \
} \
arr[arr##_count++] = val; \
} while (0)
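/* Note: the ## pasting applies to the last token of `arr`, so
* array_insert(c->labels, label) grows c->labels and updates the companion
* variables c->labels_count and c->labels_sz. */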
/* scratch area for compiling shader, freed after compilation finishes */
struct etna_compile {
const struct tgsi_token *tokens;
bool free_tokens;
struct tgsi_shader_info info;
/* Register descriptions, per TGSI file, per register index */
struct etna_compile_file file[TGSI_FILE_COUNT];
/* Keep track of TGSI register declarations */
struct etna_reg_desc decl[ETNA_MAX_DECL];
uint total_decls;
/* Bitmap of dead instructions which are removed in a separate pass */
bool dead_inst[ETNA_MAX_TOKENS];
/* Immediate data */
enum etna_immediate_contents imm_contents[ETNA_MAX_IMM];
uint32_t imm_data[ETNA_MAX_IMM];
uint32_t imm_base; /* base of immediates (in 32 bit units) */
uint32_t imm_size; /* size of immediates (in 32 bit units) */
/* Next free native register, for register allocation */
uint32_t next_free_native;
/* Temporary register for use within translated TGSI instruction,
* only allocated when needed.
*/
int inner_temps; /* number of inner temps used; at most ETNA_MAX_INNER_TEMPS available */
struct etna_native_reg inner_temp[ETNA_MAX_INNER_TEMPS];
/* Fields for handling nested conditionals */
struct etna_compile_frame frame_stack[ETNA_MAX_DEPTH];
int frame_sp;
int lbl_usage[ETNA_MAX_INSTRUCTIONS];
unsigned labels_count, labels_sz;
struct etna_compile_label *labels;
unsigned num_loops;
/* Code generation */
int inst_ptr; /* current instruction pointer */
uint32_t code[ETNA_MAX_INSTRUCTIONS * ETNA_INST_SIZE];
/* I/O */
/* Number of varyings (PS only) */
int num_varyings;
/* GPU hardware specs */
const struct etna_specs *specs;
const struct etna_shader_key *key;
};
static struct etna_reg_desc *
etna_get_dst_reg(struct etna_compile *c, struct tgsi_dst_register dst)
{
return &c->file[dst.File].reg[dst.Index];
}
static struct etna_reg_desc *
etna_get_src_reg(struct etna_compile *c, struct tgsi_src_register src)
{
return &c->file[src.File].reg[src.Index];
}
static struct etna_native_reg
etna_native_temp(unsigned reg)
{
return (struct etna_native_reg) {
.valid = 1,
.rgroup = INST_RGROUP_TEMP,
.id = reg
};
}
/** Register allocation **/
enum reg_sort_order {
FIRST_USE_ASC,
FIRST_USE_DESC,
LAST_USE_ASC,
LAST_USE_DESC
};
/* Augmented register description for sorting */
struct sort_rec {
struct etna_reg_desc *ptr;
int key;
};
static int
sort_rec_compar(const struct sort_rec *a, const struct sort_rec *b)
{
if (a->key < b->key)
return -1;
if (a->key > b->key)
return 1;
return 0;
}
/* create an index on a register set based on certain criteria. */
static int
sort_registers(struct sort_rec *sorted, struct etna_compile_file *file,
enum reg_sort_order so)
{
struct etna_reg_desc *regs = file->reg;
int ptr = 0;
/* pre-populate keys from active registers */
for (int idx = 0; idx < file->reg_size; ++idx) {
/* only interested in active registers now; will only assign inactive ones
* if no space in active ones */
if (regs[idx].active) {
sorted[ptr].ptr = &regs[idx];
switch (so) {
case FIRST_USE_ASC:
sorted[ptr].key = regs[idx].first_use;
break;
case LAST_USE_ASC:
sorted[ptr].key = regs[idx].last_use;
break;
case FIRST_USE_DESC:
sorted[ptr].key = -regs[idx].first_use;
break;
case LAST_USE_DESC:
sorted[ptr].key = -regs[idx].last_use;
break;
}
ptr++;
}
}
/* sort index by key */
qsort(sorted, ptr, sizeof(struct sort_rec),
(int (*)(const void *, const void *))sort_rec_compar);
return ptr;
}
/* Allocate a new, unused, native temp register */
static struct etna_native_reg
alloc_new_native_reg(struct etna_compile *c)
{
assert(c->next_free_native < ETNA_MAX_TEMPS);
return etna_native_temp(c->next_free_native++);
}
/* assign TEMPs to native registers */
static void
assign_temporaries_to_native(struct etna_compile *c,
struct etna_compile_file *file)
{
struct etna_reg_desc *temps = file->reg;
for (int idx = 0; idx < file->reg_size; ++idx)
temps[idx].native = alloc_new_native_reg(c);
}
/* assign inputs and outputs to temporaries.
* Gallium assumes that the hardware has separate registers for inputs and
* outputs, but Vivante GPUs use temporaries both for passing in inputs and
* for passing back outputs.
* Try to re-use temporary registers where possible. */
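/* For example, with inputs sorted by last use {IN0: 3, IN1: 8} and
* temporaries sorted by first use {T0: 4, T1: 7}, IN0 (last used at 3) can
* share T0 (first used at 4), while IN1 (last used at 8) overlaps both
* temporaries and gets a freshly allocated register instead. */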
static void
assign_inouts_to_temporaries(struct etna_compile *c, uint file)
{
bool mode_inputs = (file == TGSI_FILE_INPUT);
int inout_ptr = 0, num_inouts;
int temp_ptr = 0, num_temps;
struct sort_rec inout_order[ETNA_MAX_TEMPS];
struct sort_rec temps_order[ETNA_MAX_TEMPS];
num_inouts = sort_registers(inout_order, &c->file[file],
mode_inputs ? LAST_USE_ASC : FIRST_USE_ASC);
num_temps = sort_registers(temps_order, &c->file[TGSI_FILE_TEMPORARY],
mode_inputs ? FIRST_USE_ASC : LAST_USE_ASC);
while (inout_ptr < num_inouts && temp_ptr < num_temps) {
struct etna_reg_desc *inout = inout_order[inout_ptr].ptr;
struct etna_reg_desc *temp = temps_order[temp_ptr].ptr;
if (!inout->active || inout->native.valid) { /* Skip if already a native register assigned */
inout_ptr++;
continue;
}
/* Does the last use of this input precede (or coincide with) the first
* use of the temporary? (For outputs, the test is reversed.) */
if (mode_inputs ? (inout->last_use <= temp->first_use)
: (inout->first_use >= temp->last_use)) {
/* assign it and advance to next input */
inout->native = temp->native;
inout_ptr++;
}
temp_ptr++;
}
/* if we couldn't reuse current ones, allocate new temporaries */
for (inout_ptr = 0; inout_ptr < num_inouts; ++inout_ptr) {
struct etna_reg_desc *inout = inout_order[inout_ptr].ptr;
if (inout->active && !inout->native.valid)
inout->native = alloc_new_native_reg(c);
}
}
/* Allocate an immediate with a certain value and return a source operand
* referencing it. If there is already an immediate with that value, reuse it.
*/
static struct etna_inst_src
alloc_imm(struct etna_compile *c, enum etna_immediate_contents contents,
uint32_t value)
{
int idx;
/* Could use a hash table to speed this up */
for (idx = 0; idx < c->imm_size; ++idx) {
if (c->imm_contents[idx] == contents && c->imm_data[idx] == value)
break;
}
/* check if there is an unused slot */
if (idx == c->imm_size) {
for (idx = 0; idx < c->imm_size; ++idx) {
if (c->imm_contents[idx] == ETNA_IMMEDIATE_UNUSED)
break;
}
}
/* allocate new immediate */
if (idx == c->imm_size) {
assert(c->imm_size < ETNA_MAX_IMM);
idx = c->imm_size++;
c->imm_data[idx] = value;
c->imm_contents[idx] = contents;
}
/* swizzle so that the component holding the value is broadcast to all components */
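/* For example, with imm_base == 0, index 6 selects uniform register 1
* (6 / 4) with broadcast swizzle .zzzz (6 & 3 == 2). */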
idx += c->imm_base;
struct etna_inst_src imm_src = {
.use = 1,
.rgroup = INST_RGROUP_UNIFORM_0,
.reg = idx / 4,
.swiz = INST_SWIZ_BROADCAST(idx & 3)
};
return imm_src;
}
static struct etna_inst_src
alloc_imm_u32(struct etna_compile *c, uint32_t value)
{
return alloc_imm(c, ETNA_IMMEDIATE_CONSTANT, value);
}
static struct etna_inst_src
alloc_imm_vec4u(struct etna_compile *c, enum etna_immediate_contents contents,
const uint32_t *values)
{
struct etna_inst_src imm_src = { };
int idx, i;
for (idx = 0; idx + 3 < c->imm_size; idx += 4) {
/* What if we can use a uniform with a different swizzle? */
for (i = 0; i < 4; i++)
if (c->imm_contents[idx + i] != contents || c->imm_data[idx + i] != values[i])
break;
if (i == 4)
break;
}
if (idx + 3 >= c->imm_size) {
idx = align(c->imm_size, 4);
assert(idx + 4 <= ETNA_MAX_IMM);
for (i = 0; i < 4; i++) {
c->imm_data[idx + i] = values[i];
c->imm_contents[idx + i] = contents;
}
c->imm_size = idx + 4;
}
assert((c->imm_base & 3) == 0);
idx += c->imm_base;
imm_src.use = 1;
imm_src.rgroup = INST_RGROUP_UNIFORM_0;
imm_src.reg = idx / 4;
imm_src.swiz = INST_SWIZ_IDENTITY;
return imm_src;
}
static uint32_t
get_imm_u32(struct etna_compile *c, const struct etna_inst_src *imm,
unsigned swiz_idx)
{
assert(imm->use == 1 && imm->rgroup == INST_RGROUP_UNIFORM_0);
unsigned int idx = imm->reg * 4 + ((imm->swiz >> (swiz_idx * 2)) & 3);
return c->imm_data[idx];
}
/* Allocate an immediate with a certain float value. If there is already an
* immediate with that value, reuse it.
*/
static struct etna_inst_src
alloc_imm_f32(struct etna_compile *c, float value)
{
return alloc_imm_u32(c, fui(value));
}
static struct etna_inst_src
etna_imm_vec4f(struct etna_compile *c, const float *vec4)
{
uint32_t val[4];
for (int i = 0; i < 4; i++)
val[i] = fui(vec4[i]);
return alloc_imm_vec4u(c, ETNA_IMMEDIATE_CONSTANT, val);
}
/* Pass -- check register file declarations and immediates */
static void
etna_compile_parse_declarations(struct etna_compile *c)
{
struct tgsi_parse_context ctx = { };
unsigned status = TGSI_PARSE_OK;
status = tgsi_parse_init(&ctx, c->tokens);
assert(status == TGSI_PARSE_OK);
while (!tgsi_parse_end_of_tokens(&ctx)) {
tgsi_parse_token(&ctx);
switch (ctx.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_IMMEDIATE: {
/* immediates are handled differently from other files; they are
* not declared explicitly, and always add four components */
const struct tgsi_full_immediate *imm = &ctx.FullToken.FullImmediate;
assert(c->imm_size <= (ETNA_MAX_IMM - 4));
for (int i = 0; i < 4; ++i) {
unsigned idx = c->imm_size++;
c->imm_data[idx] = imm->u[i].Uint;
c->imm_contents[idx] = ETNA_IMMEDIATE_CONSTANT;
}
}
break;
}
}
tgsi_parse_free(&ctx);
}
/* Allocate register declarations for the registers in all register files */
static void
etna_allocate_decls(struct etna_compile *c)
{
uint idx = 0;
for (int x = 0; x < TGSI_FILE_COUNT; ++x) {
c->file[x].reg = &c->decl[idx];
c->file[x].reg_size = c->info.file_max[x] + 1;
for (int sub = 0; sub < c->file[x].reg_size; ++sub) {
c->decl[idx].file = x;
c->decl[idx].idx = sub;
idx++;
}
}
c->total_decls = idx;
}
/* Pass -- check and record usage of temporaries, inputs, outputs */
static void
etna_compile_pass_check_usage(struct etna_compile *c)
{
struct tgsi_parse_context ctx = { };
unsigned status = TGSI_PARSE_OK;
status = tgsi_parse_init(&ctx, c->tokens);
assert(status == TGSI_PARSE_OK);
for (int idx = 0; idx < c->total_decls; ++idx) {
c->decl[idx].active = false;
c->decl[idx].first_use = c->decl[idx].last_use = -1;
}
int inst_idx = 0;
while (!tgsi_parse_end_of_tokens(&ctx)) {
tgsi_parse_token(&ctx);
/* Find out the maximum register numbers used.
* For every register, mark the first and last instruction index where it
* is used; this allows finding ranges where a temporary can be borrowed
* as input and/or output register.
*
* XXX in the case of loops this needs special care, or even has to be
* completely disabled: the last usage of a register inside a loop means it
* can still be used on the next loop iteration (execution is no longer
* chronological), so the register can only be declared "free" after the
* loop finishes.
*
* Same for inputs: the first usage of a register inside a loop doesn't mean
* that the register won't have been overwritten in a previous iteration,
* so the register can only be declared free before the loop starts.
*
* The proper way would be full dominator / post-dominator analysis
* (especially with more complicated control flow such as direct branch
* instructions), but not for now...
*/
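/* e.g. a register first written at instruction 2 and last read at
* instruction 7 ends up with first_use == 2 and last_use == 7. */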
switch (ctx.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_DECLARATION: {
/* Declaration: fill in file details */
const struct tgsi_full_declaration *decl = &ctx.FullToken.FullDeclaration;
struct etna_compile_file *file = &c->file[decl->Declaration.File];
for (int idx = decl->Range.First; idx <= decl->Range.Last; ++idx) {
file->reg[idx].usage_mask = 0; // we'll compute this ourselves
file->reg[idx].has_semantic = decl->Declaration.Semantic;
file->reg[idx].semantic = decl->Semantic;
file->reg[idx].interp = decl->Interp;
}
} break;
case TGSI_TOKEN_TYPE_INSTRUCTION: {
/* Instruction: iterate over operands of instruction */
const struct tgsi_full_instruction *inst = &ctx.FullToken.FullInstruction;
/* iterate over destination registers */
for (int idx = 0; idx < inst->Instruction.NumDstRegs; ++idx) {
struct etna_reg_desc *reg_desc = &c->file[inst->Dst[idx].Register.File].reg[inst->Dst[idx].Register.Index];
if (reg_desc->first_use == -1)
reg_desc->first_use = inst_idx;
reg_desc->last_use = inst_idx;
reg_desc->active = true;
}
/* iterate over source registers */
for (int idx = 0; idx < inst->Instruction.NumSrcRegs; ++idx) {
struct etna_reg_desc *reg_desc = &c->file[inst->Src[idx].Register.File].reg[inst->Src[idx].Register.Index];
if (reg_desc->first_use == -1)
reg_desc->first_use = inst_idx;
reg_desc->last_use = inst_idx;
reg_desc->active = true;
/* accumulate usage mask for the register; this is used to determine
* how many varying slots to allocate */
reg_desc->usage_mask |= tgsi_util_get_inst_usage_mask(inst, idx);
}
inst_idx += 1;
} break;
default:
break;
}
}
tgsi_parse_free(&ctx);
}
/* assign inputs that need to be assigned to specific registers */
static void
assign_special_inputs(struct etna_compile *c)
{
if (c->info.processor == PIPE_SHADER_FRAGMENT) {
/* never assign t0 as it is the position output, start assigning at t1 */
c->next_free_native = 1;
/* hardwire TGSI_SEMANTIC_POSITION (input and output) to t0 */
for (int idx = 0; idx < c->total_decls; ++idx) {
struct etna_reg_desc *reg = &c->decl[idx];
if (reg->active && reg->semantic.Name == TGSI_SEMANTIC_POSITION)
reg->native = etna_native_temp(0);
}
}
}
/* Check that a move instruction does not swizzle any of the components
* that it writes.
*/
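/* For example, a MOV writing .xy from a source swizzled .yxzw fails this
* check (the written x and y channels are swapped), while a source with the
* identity swizzle .xyzw passes regardless of the write mask. */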
static bool
etna_mov_check_no_swizzle(const struct tgsi_dst_register dst,
const struct tgsi_src_register src)
{
return (!(dst.WriteMask & TGSI_WRITEMASK_X) || src.SwizzleX == TGSI_SWIZZLE_X) &&
(!(dst.WriteMask & TGSI_WRITEMASK_Y) || src.SwizzleY == TGSI_SWIZZLE_Y) &&
(!(dst.WriteMask & TGSI_WRITEMASK_Z) || src.SwizzleZ == TGSI_SWIZZLE_Z) &&
(!(dst.WriteMask & TGSI_WRITEMASK_W) || src.SwizzleW == TGSI_SWIZZLE_W);
}
/* Pass -- optimize outputs
* Mesa tends to generate code like this at the end of its shaders:
* MOV OUT[1], TEMP[2]
* MOV OUT[0], TEMP[0]
* MOV OUT[2], TEMP[1]
* Recognize if
* a) there is only a single assignment to an output register and
* b) the temporary is not used after that.
* Also recognize direct assignment of IN to OUT (passthrough).
*/
static void
etna_compile_pass_optimize_outputs(struct etna_compile *c)
{
struct tgsi_parse_context ctx = { };
int inst_idx = 0;
unsigned status = TGSI_PARSE_OK;
status = tgsi_parse_init(&ctx, c->tokens);
assert(status == TGSI_PARSE_OK);
while (!tgsi_parse_end_of_tokens(&ctx)) {
tgsi_parse_token(&ctx);
switch (ctx.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_INSTRUCTION: {
const struct tgsi_full_instruction *inst = &ctx.FullToken.FullInstruction;
/* iterate over operands */
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_MOV: {
/* We are only interested in eliminating MOVs which write to
* the shader outputs. Test for this early. */
if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
break;
/* Elimination of a MOV must have no visible effect on the
* resulting shader: this means the MOV must not swizzle or
* saturate, and its source must not have the negate or
* absolute modifiers. */
if (!etna_mov_check_no_swizzle(inst->Dst[0].Register, inst->Src[0].Register) ||
inst->Instruction.Saturate || inst->Src[0].Register.Negate ||
inst->Src[0].Register.Absolute)
break;
uint out_idx = inst->Dst[0].Register.Index;
uint in_idx = inst->Src[0].Register.Index;
/* Assignment of temporary to output, if:
* - the output doesn't yet have a native register assigned,
* - the last use of the temporary is this instruction, and
* - the MOV does not swizzle (checked above).
*/
if (inst->Src[0].Register.File == TGSI_FILE_TEMPORARY &&
!c->file[TGSI_FILE_OUTPUT].reg[out_idx].native.valid &&
c->file[TGSI_FILE_TEMPORARY].reg[in_idx].last_use == inst_idx) {
c->file[TGSI_FILE_OUTPUT].reg[out_idx].native =
c->file[TGSI_FILE_TEMPORARY].reg[in_idx].native;
/* prevent temp from being re-used for the rest of the shader */
c->file[TGSI_FILE_TEMPORARY].reg[in_idx].last_use = ETNA_MAX_TOKENS;
/* mark this MOV instruction as a no-op */
c->dead_inst[inst_idx] = true;
}
/* Direct assignment of input to output (passthrough), if:
* - neither the input nor the output has a native register assigned yet, and
* - the output is only used in this instruction.
* Allocate a new register and associate both input and output with it.
* (The MOV not swizzling was checked above.)
*/
if (inst->Src[0].Register.File == TGSI_FILE_INPUT &&
!c->file[TGSI_FILE_INPUT].reg[in_idx].native.valid &&
!c->file[TGSI_FILE_OUTPUT].reg[out_idx].native.valid &&
c->file[TGSI_FILE_OUTPUT].reg[out_idx].last_use == inst_idx &&
c->file[TGSI_FILE_OUTPUT].reg[out_idx].first_use == inst_idx) {
c->file[TGSI_FILE_OUTPUT].reg[out_idx].native =
c->file[TGSI_FILE_INPUT].reg[in_idx].native =
alloc_new_native_reg(c);
/* mark this MOV instruction as a no-op */
c->dead_inst[inst_idx] = true;
}
} break;
default:;
}
inst_idx += 1;
} break;
}
}
tgsi_parse_free(&ctx);
}
/* Get a temporary to be used within one TGSI instruction.
* The counter resets for each new instruction; within one instruction each
* call returns the next inner temporary (allocated on first use), up to
* ETNA_MAX_INNER_TEMPS.
*/
static struct etna_native_reg
etna_compile_get_inner_temp(struct etna_compile *c)
{
int inner_temp = c->inner_temps;
if (inner_temp < ETNA_MAX_INNER_TEMPS) {
if (!c->inner_temp[inner_temp].valid)
c->inner_temp[inner_temp] = alloc_new_native_reg(c);
/* alloc_new_native_reg() handles lack of registers */
c->inner_temps += 1;
} else {
BUG("Too many inner temporaries (%i) requested in one instruction",
inner_temp + 1);
}
return c->inner_temp[inner_temp];
}
static struct etna_inst_dst
etna_native_to_dst(struct etna_native_reg native, unsigned comps)
{
/* Can only assign to temporaries */
assert(native.valid && !native.is_tex && native.rgroup == INST_RGROUP_TEMP);
struct etna_inst_dst rv = {
.comps = comps,
.use = 1,
.reg = native.id,
};
return rv;
}
static struct etna_inst_src
etna_native_to_src(struct etna_native_reg native, uint32_t swizzle)
{
assert(native.valid && !native.is_tex);
struct etna_inst_src rv = {
.use = 1,
.swiz = swizzle,
.rgroup = native.rgroup,
.reg = native.id,
.amode = INST_AMODE_DIRECT,
};
return rv;
}
static inline struct etna_inst_src
negate(struct etna_inst_src src)
{
src.neg = !src.neg;
return src;
}
static inline struct etna_inst_src
absolute(struct etna_inst_src src)
{
src.abs = 1;
return src;
}
static inline struct etna_inst_src
swizzle(struct etna_inst_src src, unsigned swizzle)
{
src.swiz = inst_swiz_compose(src.swiz, swizzle);
return src;
}
/* Emit instruction and append it to program */
static void
emit_inst(struct etna_compile *c, struct etna_inst *inst)
{
assert(c->inst_ptr <= ETNA_MAX_INSTRUCTIONS);
/* Check for uniform conflicts (each instruction can only access one
* uniform); if one is detected, route the offending operand through an
* intermediate temporary */
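/* e.g. an ADD reading both u0 and u1 would need two uniform fetches in a
* single instruction; the second operand is first copied into an inner
* temporary with a MOV. */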
unsigned uni_rgroup = -1;
unsigned uni_reg = -1;
for (int src = 0; src < ETNA_NUM_SRC; ++src) {
if (etna_rgroup_is_uniform(inst->src[src].rgroup)) {
if (uni_reg == -1) { /* first unique uniform used */
uni_rgroup = inst->src[src].rgroup;
uni_reg = inst->src[src].reg;
} else { /* second or later; check that it is a re-use */
if (uni_rgroup != inst->src[src].rgroup ||
uni_reg != inst->src[src].reg) {
DBG_F(ETNA_DBG_COMPILER_MSGS, "perf warning: instruction that "
"accesses different uniforms, "
"need to generate extra MOV");
struct etna_native_reg inner_temp = etna_compile_get_inner_temp(c);
/* Generate move instruction to temporary */
etna_assemble(&c->code[c->inst_ptr * 4], &(struct etna_inst) {
.opcode = INST_OPCODE_MOV,
.dst = etna_native_to_dst(inner_temp, INST_COMPS_X | INST_COMPS_Y |
INST_COMPS_Z | INST_COMPS_W),
.src[2] = inst->src[src]
});
c->inst_ptr++;
/* Modify instruction to use temp register instead of uniform */
inst->src[src].use = 1;
inst->src[src].rgroup = INST_RGROUP_TEMP;
inst->src[src].reg = inner_temp.id;
inst->src[src].swiz = INST_SWIZ_IDENTITY; /* swizzling happens on MOV */
inst->src[src].neg = 0; /* negation happens on MOV */
inst->src[src].abs = 0; /* abs happens on MOV */
inst->src[src].amode = 0; /* amode effects happen on MOV */
}
}
}
}
/* Finally assemble the actual instruction */
etna_assemble(&c->code[c->inst_ptr * 4], inst);
c->inst_ptr++;
}
static unsigned int
etna_amode(struct tgsi_ind_register indirect)
{
assert(indirect.File == TGSI_FILE_ADDRESS);
assert(indirect.Index == 0);
switch (indirect.Swizzle) {
case TGSI_SWIZZLE_X:
return INST_AMODE_ADD_A_X;
case TGSI_SWIZZLE_Y:
return INST_AMODE_ADD_A_Y;
case TGSI_SWIZZLE_Z:
return INST_AMODE_ADD_A_Z;
case TGSI_SWIZZLE_W:
return INST_AMODE_ADD_A_W;
default:
assert(!"Invalid swizzle");
}
unreachable("bad swizzle");
}
/* convert destination operand */
static struct etna_inst_dst
convert_dst(struct etna_compile *c, const struct tgsi_full_dst_register *in)
{
struct etna_inst_dst rv = {
/// XXX .amode
.comps = in->Register.WriteMask,
};
if (in->Register.File == TGSI_FILE_ADDRESS) {
assert(in->Register.Index == 0);
rv.reg = in->Register.Index;
rv.use = 0;
} else {
rv = etna_native_to_dst(etna_get_dst_reg(c, in->Register)->native,
in->Register.WriteMask);
}
if (in->Register.Indirect)
rv.amode = etna_amode(in->Indirect);
return rv;
}
/* convert texture operand */
static struct etna_inst_tex
convert_tex(struct etna_compile *c, const struct tgsi_full_src_register *in,
const struct tgsi_instruction_texture *tex)
{
struct etna_native_reg native_reg = etna_get_src_reg(c, in->Register)->native;
struct etna_inst_tex rv = {
// XXX .amode (to allow for an array of samplers?)
.swiz = INST_SWIZ_IDENTITY
};
assert(native_reg.is_tex && native_reg.valid);
rv.id = native_reg.id;
return rv;
}
/* convert source operand */
static struct etna_inst_src
etna_create_src(const struct tgsi_full_src_register *tgsi,
const struct etna_native_reg *native)
{
const struct tgsi_src_register *reg = &tgsi->Register;
struct etna_inst_src rv = {
.use = 1,
.swiz = INST_SWIZ(reg->SwizzleX, reg->SwizzleY, reg->SwizzleZ, reg->SwizzleW),
.neg = reg->Negate,
.abs = reg->Absolute,
.rgroup = native->rgroup,
.reg = native->id,
.amode = INST_AMODE_DIRECT,
};
assert(native->valid && !native->is_tex);
if (reg->Indirect)
rv.amode = etna_amode(tgsi->Indirect);
return rv;
}
static struct etna_inst_src
etna_mov_src_to_temp(struct etna_compile *c, struct etna_inst_src src,
struct etna_native_reg temp)
{
struct etna_inst mov = { };
mov.opcode = INST_OPCODE_MOV;
mov.sat = 0;
mov.dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
INST_COMPS_Z | INST_COMPS_W);
mov.src[2] = src;
emit_inst(c, &mov);
src.swiz = INST_SWIZ_IDENTITY;
src.neg = src.abs = 0;
src.rgroup = temp.rgroup;
src.reg = temp.id;
return src;
}
static struct etna_inst_src
etna_mov_src(struct etna_compile *c, struct etna_inst_src src)
{
struct etna_native_reg temp = etna_compile_get_inner_temp(c);
return etna_mov_src_to_temp(c, src, temp);
}
static bool
etna_src_uniforms_conflict(struct etna_inst_src a, struct etna_inst_src b)
{
return etna_rgroup_is_uniform(a.rgroup) &&
etna_rgroup_is_uniform(b.rgroup) &&
(a.rgroup != b.rgroup || a.reg != b.reg);
}
/* create a new label */
static unsigned int
alloc_new_label(struct etna_compile *c)
{
struct etna_compile_label label = {
.inst_idx = -1, /* starts out pointing at no specific instruction */
};
array_insert(c->labels, label);
return c->labels_count - 1;
}
/* place label at current instruction pointer */
static void
label_place(struct etna_compile *c, struct etna_compile_label *label)
{
label->inst_idx = c->inst_ptr;
}
/* mark label use at current instruction.
* The label's target will be filled into the marked instruction's src2.imm
* slot as soon as the value becomes known.
*/
static void
label_mark_use(struct etna_compile *c, int lbl_idx)
{
assert(c->inst_ptr < ETNA_MAX_INSTRUCTIONS);
c->lbl_usage[c->inst_ptr] = lbl_idx;
}
/* walk the frame stack and return first frame with matching type */
static struct etna_compile_frame *
find_frame(struct etna_compile *c, enum etna_compile_frame_type type)
{
for (int sp = c->frame_sp; sp >= 0; sp--)
if (c->frame_stack[sp].type == type)
return &c->frame_stack[sp];
assert(0);
return NULL;
}
struct instr_translater {
void (*fxn)(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst,
struct etna_inst_src *src);
unsigned tgsi_opc;
uint8_t opc;
/* maps tgsi src index -> etna src slot (-1 = unused) */
int src[3];
unsigned cond;
};
static void
trans_instr(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
const struct tgsi_opcode_info *info = tgsi_get_opcode_info(inst->Instruction.Opcode);
struct etna_inst instr = { };
instr.opcode = t->opc;
instr.cond = t->cond;
instr.sat = inst->Instruction.Saturate;
assert(info->num_dst <= 1);
if (info->num_dst)
instr.dst = convert_dst(c, &inst->Dst[0]);
assert(info->num_src <= ETNA_NUM_SRC);
for (unsigned i = 0; i < info->num_src; i++) {
int swizzle = t->src[i];
assert(swizzle != -1);
instr.src[swizzle] = src[i];
}
emit_inst(c, &instr);
}
static void
trans_min_max(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst,
struct etna_inst_src *src)
{
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_SELECT,
.cond = t->cond,
.sat = inst->Instruction.Saturate,
.dst = convert_dst(c, &inst->Dst[0]),
.src[0] = src[0],
.src[1] = src[1],
.src[2] = src[0],
});
}
static void
trans_if(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
struct etna_compile_frame *f = &c->frame_stack[c->frame_sp++];
struct etna_inst_src imm_0 = alloc_imm_f32(c, 0.0f);
/* push IF to stack */
f->type = ETNA_COMPILE_FRAME_IF;
/* create "else" label */
f->lbl_else_idx = alloc_new_label(c);
f->lbl_endif_idx = -1;
/* We need to avoid the emit_inst() below becoming two instructions */
if (etna_src_uniforms_conflict(src[0], imm_0))
src[0] = etna_mov_src(c, src[0]);
/* mark position in instruction stream of label reference so that it can be
* filled in in next pass */
label_mark_use(c, f->lbl_else_idx);
/* create conditional branch to label if src0 EQ 0 */
emit_inst(c, &(struct etna_inst){
.opcode = INST_OPCODE_BRANCH,
.cond = INST_CONDITION_EQ,
.src[0] = src[0],
.src[1] = imm_0,
/* imm is filled in later */
});
}
static void
trans_else(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
assert(c->frame_sp > 0);
struct etna_compile_frame *f = &c->frame_stack[c->frame_sp - 1];
assert(f->type == ETNA_COMPILE_FRAME_IF);
/* create "endif" label, and branch to endif label */
f->lbl_endif_idx = alloc_new_label(c);
label_mark_use(c, f->lbl_endif_idx);
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_BRANCH,
.cond = INST_CONDITION_TRUE,
/* imm is filled in later */
});
/* mark "else" label at this position in instruction stream */
label_place(c, &c->labels[f->lbl_else_idx]);
}
static void
trans_endif(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
assert(c->frame_sp > 0);
struct etna_compile_frame *f = &c->frame_stack[--c->frame_sp];
assert(f->type == ETNA_COMPILE_FRAME_IF);
/* assign "endif" or "else" (if no ELSE) label to current position in
* instruction stream, pop IF */
if (f->lbl_endif_idx != -1)
label_place(c, &c->labels[f->lbl_endif_idx]);
else
label_place(c, &c->labels[f->lbl_else_idx]);
}
static void
trans_loop_bgn(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst,
struct etna_inst_src *src)
{
struct etna_compile_frame *f = &c->frame_stack[c->frame_sp++];
/* push LOOP to stack */
f->type = ETNA_COMPILE_FRAME_LOOP;
f->lbl_loop_bgn_idx = alloc_new_label(c);
f->lbl_loop_end_idx = alloc_new_label(c);
label_place(c, &c->labels[f->lbl_loop_bgn_idx]);
c->num_loops++;
}
static void
trans_loop_end(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst,
struct etna_inst_src *src)
{
assert(c->frame_sp > 0);
struct etna_compile_frame *f = &c->frame_stack[--c->frame_sp];
assert(f->type == ETNA_COMPILE_FRAME_LOOP);
/* mark position in instruction stream of label reference so that it can be
* filled in in next pass */
label_mark_use(c, f->lbl_loop_bgn_idx);
/* create branch to loop_bgn label */
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_BRANCH,
.cond = INST_CONDITION_TRUE,
.src[0] = src[0],
/* imm is filled in later */
});
label_place(c, &c->labels[f->lbl_loop_end_idx]);
}
static void
trans_brk(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
assert(c->frame_sp > 0);
struct etna_compile_frame *f = find_frame(c, ETNA_COMPILE_FRAME_LOOP);
/* mark position in instruction stream of label reference so that it can be
* filled in in next pass */
label_mark_use(c, f->lbl_loop_end_idx);
/* create branch to loop_end label */
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_BRANCH,
.cond = INST_CONDITION_TRUE,
.src[0] = src[0],
/* imm is filled in later */
});
}
static void
trans_cont(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
assert(c->frame_sp > 0);
struct etna_compile_frame *f = find_frame(c, ETNA_COMPILE_FRAME_LOOP);
/* mark position in instruction stream of label reference so that it can be
* filled in in next pass */
label_mark_use(c, f->lbl_loop_bgn_idx);
/* create branch to loop_bgn label */
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_BRANCH,
.cond = INST_CONDITION_TRUE,
.src[0] = src[0],
/* imm is filled in later */
});
}
static void
trans_deriv(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
emit_inst(c, &(struct etna_inst) {
.opcode = t->opc,
.sat = inst->Instruction.Saturate,
.dst = convert_dst(c, &inst->Dst[0]),
.src[0] = src[0],
.src[2] = src[0],
});
}
static void
trans_arl(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
struct etna_native_reg temp = etna_compile_get_inner_temp(c);
struct etna_inst arl = { };
struct etna_inst_dst dst;
dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y | INST_COMPS_Z |
INST_COMPS_W);
if (c->specs->has_sign_floor_ceil) {
struct etna_inst floor = { };
floor.opcode = INST_OPCODE_FLOOR;
floor.src[2] = src[0];
floor.dst = dst;
emit_inst(c, &floor);
} else {
struct etna_inst floor[2] = { };
floor[0].opcode = INST_OPCODE_FRC;
floor[0].sat = inst->Instruction.Saturate;
floor[0].dst = dst;
floor[0].src[2] = src[0];
floor[1].opcode = INST_OPCODE_ADD;
floor[1].sat = inst->Instruction.Saturate;
floor[1].dst = dst;
floor[1].src[0] = src[0];
floor[1].src[2].use = 1;
floor[1].src[2].swiz = INST_SWIZ_IDENTITY;
floor[1].src[2].neg = 1;
floor[1].src[2].rgroup = temp.rgroup;
floor[1].src[2].reg = temp.id;
emit_inst(c, &floor[0]);
emit_inst(c, &floor[1]);
}
arl.opcode = INST_OPCODE_MOVAR;
arl.sat = inst->Instruction.Saturate;
arl.dst = convert_dst(c, &inst->Dst[0]);
arl.src[2] = etna_native_to_src(temp, INST_SWIZ_IDENTITY);
emit_inst(c, &arl);
}
static void
trans_lrp(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
/* dst = src0 * src1 + (1 - src0) * src2
* => src0 * src1 - (src0 - 1) * src2
* => src0 * src1 - (src0 * src2 - src2)
* MAD tTEMP.xyzw, tSRC0.xyzw, tSRC2.xyzw, -tSRC2.xyzw
* MAD tDST.xyzw, tSRC0.xyzw, tSRC1.xyzw, -tTEMP.xyzw
*/
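/* Sanity check with src0 = 0.25, src1 = 8, src2 = 4:
* tTEMP = 0.25 * 4 - 4 = -3 and dst = 0.25 * 8 - (-3) = 5, which matches
* 0.25 * 8 + 0.75 * 4 = 5. */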
struct etna_native_reg temp = etna_compile_get_inner_temp(c);
if (etna_src_uniforms_conflict(src[0], src[1]) ||
etna_src_uniforms_conflict(src[0], src[2])) {
src[0] = etna_mov_src(c, src[0]);
}
struct etna_inst mad[2] = { };
mad[0].opcode = INST_OPCODE_MAD;
mad[0].sat = 0;
mad[0].dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
INST_COMPS_Z | INST_COMPS_W);
mad[0].src[0] = src[0];
mad[0].src[1] = src[2];
mad[0].src[2] = negate(src[2]);
mad[1].opcode = INST_OPCODE_MAD;
mad[1].sat = inst->Instruction.Saturate;
mad[1].dst = convert_dst(c, &inst->Dst[0]);
mad[1].src[0] = src[0];
mad[1].src[1] = src[1];
mad[1].src[2] = negate(etna_native_to_src(temp, INST_SWIZ_IDENTITY));
emit_inst(c, &mad[0]);
emit_inst(c, &mad[1]);
}
static void
trans_lit(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
/* SELECT.LT tmp._y__, 0, src.yyyy, 0
* - can be eliminated if src.y is a uniform and >= 0
* SELECT.GT tmp.___w, 128, src.wwww, 128
* SELECT.LT tmp.___w, -128, tmp.wwww, -128
* - can be eliminated if src.w is a uniform and fits clamp
* LOG tmp.x, void, void, tmp.yyyy
* MUL tmp.x, tmp.xxxx, tmp.wwww, void
* LITP dst, undef, src.xxxx, tmp.xxxx
*/
struct etna_native_reg inner_temp = etna_compile_get_inner_temp(c);
struct etna_inst_src src_y = { };
if (!etna_rgroup_is_uniform(src[0].rgroup)) {
src_y = etna_native_to_src(inner_temp, SWIZZLE(Y, Y, Y, Y));
struct etna_inst ins = { };
ins.opcode = INST_OPCODE_SELECT;
ins.cond = INST_CONDITION_LT;
ins.dst = etna_native_to_dst(inner_temp, INST_COMPS_Y);
ins.src[0] = ins.src[2] = alloc_imm_f32(c, 0.0);
ins.src[1] = swizzle(src[0], SWIZZLE(Y, Y, Y, Y));
emit_inst(c, &ins);
} else if (uif(get_imm_u32(c, &src[0], 1)) < 0)
src_y = alloc_imm_f32(c, 0.0);
else
src_y = swizzle(src[0], SWIZZLE(Y, Y, Y, Y));
struct etna_inst_src src_w = { };
if (!etna_rgroup_is_uniform(src[0].rgroup)) {
src_w = etna_native_to_src(inner_temp, SWIZZLE(W, W, W, W));
struct etna_inst ins = { };
ins.opcode = INST_OPCODE_SELECT;
ins.cond = INST_CONDITION_GT;
ins.dst = etna_native_to_dst(inner_temp, INST_COMPS_W);
ins.src[0] = ins.src[2] = alloc_imm_f32(c, 128.);
ins.src[1] = swizzle(src[0], SWIZZLE(W, W, W, W));
emit_inst(c, &ins);
ins.cond = INST_CONDITION_LT;
ins.src[0].neg = !ins.src[0].neg;
ins.src[2].neg = !ins.src[2].neg;
ins.src[1] = src_w;
emit_inst(c, &ins);
} else if (uif(get_imm_u32(c, &src[0], 3)) < -128.)
src_w = alloc_imm_f32(c, -128.);
else if (uif(get_imm_u32(c, &src[0], 3)) > 128.)
src_w = alloc_imm_f32(c, 128.);
else
src_w = swizzle(src[0], SWIZZLE(W, W, W, W));
if (c->specs->has_new_transcendentals) { /* Alternative LOG sequence */
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_LOG,
.dst = etna_native_to_dst(inner_temp, INST_COMPS_X | INST_COMPS_Y),
.src[2] = src_y,
.tex = { .amode=1 }, /* Unknown bit needs to be set */
});
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_MUL,
.dst = etna_native_to_dst(inner_temp, INST_COMPS_X),
.src[0] = etna_native_to_src(inner_temp, SWIZZLE(X, X, X, X)),
.src[1] = etna_native_to_src(inner_temp, SWIZZLE(Y, Y, Y, Y)),
});
} else {
struct etna_inst ins[3] = { };
ins[0].opcode = INST_OPCODE_LOG;
ins[0].dst = etna_native_to_dst(inner_temp, INST_COMPS_X);
ins[0].src[2] = src_y;
emit_inst(c, &ins[0]);
}
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_MUL,
.sat = 0,
.dst = etna_native_to_dst(inner_temp, INST_COMPS_X),
.src[0] = etna_native_to_src(inner_temp, SWIZZLE(X, X, X, X)),
.src[1] = src_w,
});
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_LITP,
.sat = 0,
.dst = convert_dst(c, &inst->Dst[0]),
.src[0] = swizzle(src[0], SWIZZLE(X, X, X, X)),
.src[1] = swizzle(src[0], SWIZZLE(X, X, X, X)),
.src[2] = etna_native_to_src(inner_temp, SWIZZLE(X, X, X, X)),
});
}
static void
trans_ssg(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
if (c->specs->has_sign_floor_ceil) {
emit_inst(c, &(struct etna_inst){
.opcode = INST_OPCODE_SIGN,
.sat = inst->Instruction.Saturate,
.dst = convert_dst(c, &inst->Dst[0]),
.src[2] = src[0],
});
} else {
struct etna_native_reg temp = etna_compile_get_inner_temp(c);
struct etna_inst ins[2] = { };
ins[0].opcode = INST_OPCODE_SET;
ins[0].cond = INST_CONDITION_NZ;
ins[0].dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
INST_COMPS_Z | INST_COMPS_W);
ins[0].src[0] = src[0];
ins[1].opcode = INST_OPCODE_SELECT;
ins[1].cond = INST_CONDITION_LZ;
ins[1].sat = inst->Instruction.Saturate;
ins[1].dst = convert_dst(c, &inst->Dst[0]);
ins[1].src[0] = src[0];
ins[1].src[2] = etna_native_to_src(temp, INST_SWIZ_IDENTITY);
ins[1].src[1] = negate(ins[1].src[2]);
emit_inst(c, &ins[0]);
emit_inst(c, &ins[1]);
}
}
static void
trans_trig(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
if (c->specs->has_new_transcendentals) { /* Alternative SIN/COS */
/* On newer chips alternative SIN/COS instructions are implemented,
* which:
* - Need their input scaled by 1/pi instead of 2/pi
* - Output an x and y component, which need to be multiplied to
* get the result
*/
struct etna_native_reg temp = etna_compile_get_inner_temp(c); /* only using .xyz */
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_MUL,
.sat = 0,
.dst = etna_native_to_dst(temp, INST_COMPS_Z),
.src[0] = src[0], /* any swizzling happens here */
.src[1] = alloc_imm_f32(c, 1.0f / M_PI),
});
emit_inst(c, &(struct etna_inst) {
.opcode = inst->Instruction.Opcode == TGSI_OPCODE_COS
? INST_OPCODE_COS
: INST_OPCODE_SIN,
.sat = 0,
.dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y),
.src[2] = etna_native_to_src(temp, SWIZZLE(Z, Z, Z, Z)),
.tex = { .amode=1 }, /* Unknown bit needs to be set */
});
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_MUL,
.sat = inst->Instruction.Saturate,
.dst = convert_dst(c, &inst->Dst[0]),
.src[0] = etna_native_to_src(temp, SWIZZLE(X, X, X, X)),
.src[1] = etna_native_to_src(temp, SWIZZLE(Y, Y, Y, Y)),
});
} else if (c->specs->has_sin_cos_sqrt) {
struct etna_native_reg temp = etna_compile_get_inner_temp(c);
/* add divide by PI/2, using a temp register. GC2000
* fails with src==dst for the trig instruction. */
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_MUL,
.sat = 0,
.dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
INST_COMPS_Z | INST_COMPS_W),
.src[0] = src[0], /* any swizzling happens here */
.src[1] = alloc_imm_f32(c, 2.0f / M_PI),
});
emit_inst(c, &(struct etna_inst) {
.opcode = inst->Instruction.Opcode == TGSI_OPCODE_COS
? INST_OPCODE_COS
: INST_OPCODE_SIN,
.sat = inst->Instruction.Saturate,
.dst = convert_dst(c, &inst->Dst[0]),
.src[2] = etna_native_to_src(temp, INST_SWIZ_IDENTITY),
});
} else {
/* Implement Nick's fast sine/cosine. Taken from:
* http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
* A=(1/2*PI 0 1/2*PI 0) B=(0.75 0 0.5 0) C=(-4 4 X X)
* MAD t.x_zw, src.xxxx, A, B
* FRC t.x_z_, void, void, t.xwzw
* MAD t.x_z_, t.xwzw, 2, -1
* MUL t._y__, t.wzww, |t.wzww|, void (for sin/scs)
* DP3 t.x_z_, t.zyww, C, void (for sin)
* DP3 t.__z_, t.zyww, C, void (for scs)
* MUL t._y__, t.wxww, |t.wxww|, void (for cos/scs)
* DP3 t.x_z_, t.xyww, C, void (for cos)
* DP3 t.x___, t.xyww, C, void (for scs)
* MAD t._y_w, t.xxzz, |t.xxzz|, -t.xxzz
* MAD dst, t.ywyw, .2225, t.xzxz
*/
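/* The A and B rows above are swizzles of sincos_const[1]
* (1/(2*PI), 0.75, 0.5, 0), while the 2/-1 constants in the third MAD and
* the C row come from sincos_const[0] (2, -1, 4, -4), both defined at the
* top of this file. */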
struct etna_inst *p, ins[9] = { };
struct etna_native_reg t0 = etna_compile_get_inner_temp(c);
struct etna_inst_src t0s = etna_native_to_src(t0, INST_SWIZ_IDENTITY);
struct etna_inst_src sincos[3], in = src[0];
sincos[0] = etna_imm_vec4f(c, sincos_const[0]);
sincos[1] = etna_imm_vec4f(c, sincos_const[1]);
/* A uniform source will cause the inner temp limit to
* be exceeded. Explicitly deal with that scenario.
*/
if (etna_rgroup_is_uniform(src[0].rgroup)) {
struct etna_inst ins = { };
ins.opcode = INST_OPCODE_MOV;
ins.dst = etna_native_to_dst(t0, INST_COMPS_X);
ins.src[2] = in;
emit_inst(c, &ins);
in = t0s;
}
ins[0].opcode = INST_OPCODE_MAD;
ins[0].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z | INST_COMPS_W);
ins[0].src[0] = swizzle(in, SWIZZLE(X, X, X, X));
ins[0].src[1] = swizzle(sincos[1], SWIZZLE(X, W, X, W)); /* 1/2*PI */
ins[0].src[2] = swizzle(sincos[1], SWIZZLE(Y, W, Z, W)); /* 0.75, 0, 0.5, 0 */
ins[1].opcode = INST_OPCODE_FRC;
ins[1].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z);
ins[1].src[2] = swizzle(t0s, SWIZZLE(X, W, Z, W));
ins[2].opcode = INST_OPCODE_MAD;
ins[2].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z);
ins[2].src[0] = swizzle(t0s, SWIZZLE(X, W, Z, W));
ins[2].src[1] = swizzle(sincos[0], SWIZZLE(X, X, X, X)); /* 2 */
ins[2].src[2] = swizzle(sincos[0], SWIZZLE(Y, Y, Y, Y)); /* -1 */
unsigned mul_swiz, dp3_swiz;
if (inst->Instruction.Opcode == TGSI_OPCODE_SIN) {
mul_swiz = SWIZZLE(W, Z, W, W);
dp3_swiz = SWIZZLE(Z, Y, W, W);
} else {
mul_swiz = SWIZZLE(W, X, W, W);
dp3_swiz = SWIZZLE(X, Y, W, W);
}
ins[3].opcode = INST_OPCODE_MUL;
ins[3].dst = etna_native_to_dst(t0, INST_COMPS_Y);
ins[3].src[0] = swizzle(t0s, mul_swiz);
ins[3].src[1] = absolute(ins[3].src[0]);
ins[4].opcode = INST_OPCODE_DP3;
ins[4].dst = etna_native_to_dst(t0, INST_COMPS_X | INST_COMPS_Z);
ins[4].src[0] = swizzle(t0s, dp3_swiz);
ins[4].src[1] = swizzle(sincos[0], SWIZZLE(Z, W, W, W));
p = &ins[5];
p->opcode = INST_OPCODE_MAD;
p->dst = etna_native_to_dst(t0, INST_COMPS_Y | INST_COMPS_W);
p->src[0] = swizzle(t0s, SWIZZLE(X, X, Z, Z));
p->src[1] = absolute(p->src[0]);
p->src[2] = negate(p->src[0]);
p++;
p->opcode = INST_OPCODE_MAD;
p->sat = inst->Instruction.Saturate;
p->dst = convert_dst(c, &inst->Dst[0]);
p->src[0] = swizzle(t0s, SWIZZLE(Y, W, Y, W));
p->src[1] = alloc_imm_f32(c, 0.2225);
p->src[2] = swizzle(t0s, SWIZZLE(X, Z, X, Z));
for (int i = 0; &ins[i] <= p; i++)
emit_inst(c, &ins[i]);
}
}
static void
trans_lg2(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
if (c->specs->has_new_transcendentals) {
/* On newer chips an alternative LOG instruction is implemented,
* which outputs an x and y component that need to be multiplied
* together to get the result.
*/
struct etna_native_reg temp = etna_compile_get_inner_temp(c); /* only using .xy */
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_LOG,
.sat = 0,
.dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y),
.src[2] = src[0],
.tex = { .amode=1 }, /* Unknown bit needs to be set */
});
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_MUL,
.sat = inst->Instruction.Saturate,
.dst = convert_dst(c, &inst->Dst[0]),
.src[0] = etna_native_to_src(temp, SWIZZLE(X, X, X, X)),
.src[1] = etna_native_to_src(temp, SWIZZLE(Y, Y, Y, Y)),
});
} else {
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_LOG,
.sat = inst->Instruction.Saturate,
.dst = convert_dst(c, &inst->Dst[0]),
.src[2] = src[0],
});
}
}
static void
trans_sampler(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst,
struct etna_inst_src *src)
{
/* There is no native support for GL texture rectangle coordinates, so
* we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, 1]). */
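/* Note: the ETNA_IMMEDIATE_TEXRECT_SCALE_X/Y immediates allocated below
* carry the sampler unit as their value; the actual 1/width and 1/height
* scale factors are presumably patched into these uniform slots once the
* bound texture's dimensions are known. */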
if (inst->Texture.Texture == TGSI_TEXTURE_RECT) {
uint32_t unit = inst->Src[1].Register.Index;
struct etna_inst ins[2] = { };
struct etna_native_reg temp = etna_compile_get_inner_temp(c);
ins[0].opcode = INST_OPCODE_MUL;
ins[0].dst = etna_native_to_dst(temp, INST_COMPS_X);
ins[0].src[0] = src[0];
ins[0].src[1] = alloc_imm(c, ETNA_IMMEDIATE_TEXRECT_SCALE_X, unit);
ins[1].opcode = INST_OPCODE_MUL;
ins[1].dst = etna_native_to_dst(temp, INST_COMPS_Y);
ins[1].src[0] = src[0];
ins[1].src[1] = alloc_imm(c, ETNA_IMMEDIATE_TEXRECT_SCALE_Y, unit);
emit_inst(c, &ins[0]);
emit_inst(c, &ins[1]);
src[0] = etna_native_to_src(temp, INST_SWIZ_IDENTITY); /* temp.xyzw */
}
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_TEX:
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_TEXLD,
.sat = 0,
.dst = convert_dst(c, &inst->Dst[0]),
.tex = convert_tex(c, &inst->Src[1], &inst->Texture),
.src[0] = src[0],
});
break;
case TGSI_OPCODE_TXB:
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_TEXLDB,
.sat = 0,
.dst = convert_dst(c, &inst->Dst[0]),
.tex = convert_tex(c, &inst->Src[1], &inst->Texture),
.src[0] = src[0],
});
break;
case TGSI_OPCODE_TXL:
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_TEXLDL,
.sat = 0,
.dst = convert_dst(c, &inst->Dst[0]),
.tex = convert_tex(c, &inst->Src[1], &inst->Texture),
.src[0] = src[0],
});
break;
case TGSI_OPCODE_TXP: { /* divide src.xyz by src.w */
struct etna_native_reg temp = etna_compile_get_inner_temp(c);
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_RCP,
.sat = 0,
.dst = etna_native_to_dst(temp, INST_COMPS_W), /* tmp.w */
.src[2] = swizzle(src[0], SWIZZLE(W, W, W, W)),
});
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_MUL,
.sat = 0,
.dst = etna_native_to_dst(temp, INST_COMPS_X | INST_COMPS_Y |
INST_COMPS_Z), /* tmp.xyz */
.src[0] = etna_native_to_src(temp, SWIZZLE(W, W, W, W)),
.src[1] = src[0], /* src.xyzw */
});
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_TEXLD,
.sat = 0,
.dst = convert_dst(c, &inst->Dst[0]),
.tex = convert_tex(c, &inst->Src[1], &inst->Texture),
.src[0] = etna_native_to_src(temp, INST_SWIZ_IDENTITY), /* tmp.xyzw */
});
} break;
default:
BUG("Unhandled instruction %s",
tgsi_get_opcode_name(inst->Instruction.Opcode));
assert(0);
break;
}
}
static void
trans_dummy(const struct instr_translater *t, struct etna_compile *c,
const struct tgsi_full_instruction *inst, struct etna_inst_src *src)
{
/* nothing to do */
}
static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
#define INSTR(n, f, ...) \
[TGSI_OPCODE_##n] = {.fxn = (f), .tgsi_opc = TGSI_OPCODE_##n, ##__VA_ARGS__}
INSTR(MOV, trans_instr, .opc = INST_OPCODE_MOV, .src = {2, -1, -1}),
INSTR(RCP, trans_instr, .opc = INST_OPCODE_RCP, .src = {2, -1, -1}),
INSTR(RSQ, trans_instr, .opc = INST_OPCODE_RSQ, .src = {2, -1, -1}),
INSTR(MUL, trans_instr, .opc = INST_OPCODE_MUL, .src = {0, 1, -1}),
INSTR(ADD, trans_instr, .opc = INST_OPCODE_ADD, .src = {0, 2, -1}),
INSTR(DP2, trans_instr, .opc = INST_OPCODE_DP2, .src = {0, 1, -1}),
INSTR(DP3, trans_instr, .opc = INST_OPCODE_DP3, .src = {0, 1, -1}),
INSTR(DP4, trans_instr, .opc = INST_OPCODE_DP4, .src = {0, 1, -1}),
INSTR(DST, trans_instr, .opc = INST_OPCODE_DST, .src = {0, 1, -1}),
INSTR(MAD, trans_instr, .opc = INST_OPCODE_MAD, .src = {0, 1, 2}),
INSTR(EX2, trans_instr, .opc = INST_OPCODE_EXP, .src = {2, -1, -1}),
INSTR(LG2, trans_lg2),
INSTR(SQRT, trans_instr, .opc = INST_OPCODE_SQRT, .src = {2, -1, -1}),
INSTR(FRC, trans_instr, .opc = INST_OPCODE_FRC, .src = {2, -1, -1}),
INSTR(CEIL, trans_instr, .opc = INST_OPCODE_CEIL, .src = {2, -1, -1}),
INSTR(FLR, trans_instr, .opc = INST_OPCODE_FLOOR, .src = {2, -1, -1}),
INSTR(CMP, trans_instr, .opc = INST_OPCODE_SELECT, .src = {0, 1, 2}, .cond = INST_CONDITION_LZ),
INSTR(KILL, trans_instr, .opc = INST_OPCODE_TEXKILL),
INSTR(KILL_IF, trans_instr, .opc = INST_OPCODE_TEXKILL, .src = {0, -1, -1}, .cond = INST_CONDITION_LZ),
INSTR(DDX, trans_deriv, .opc = INST_OPCODE_DSX),
INSTR(DDY, trans_deriv, .opc = INST_OPCODE_DSY),
INSTR(IF, trans_if),
INSTR(ELSE, trans_else),
INSTR(ENDIF, trans_endif),
INSTR(BGNLOOP, trans_loop_bgn),
INSTR(ENDLOOP, trans_loop_end),
INSTR(BRK, trans_brk),
INSTR(CONT, trans_cont),
INSTR(MIN, trans_min_max, .opc = INST_OPCODE_SELECT, .cond = INST_CONDITION_GT),
INSTR(MAX, trans_min_max, .opc = INST_OPCODE_SELECT, .cond = INST_CONDITION_LT),
INSTR(ARL, trans_arl),
INSTR(LRP, trans_lrp),
INSTR(LIT, trans_lit),
INSTR(SSG, trans_ssg),
INSTR(SIN, trans_trig),
INSTR(COS, trans_trig),
INSTR(SLT, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_LT),
INSTR(SGE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_GE),
INSTR(SEQ, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_EQ),
INSTR(SGT, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_GT),
INSTR(SLE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_LE),
INSTR(SNE, trans_instr, .opc = INST_OPCODE_SET, .src = {0, 1, -1}, .cond = INST_CONDITION_NE),
INSTR(TEX, trans_sampler),
INSTR(TXB, trans_sampler),
INSTR(TXL, trans_sampler),
INSTR(TXP, trans_sampler),
INSTR(NOP, trans_dummy),
INSTR(END, trans_dummy),
};
/* Pass -- compile instructions */
static void
etna_compile_pass_generate_code(struct etna_compile *c)
{
struct tgsi_parse_context ctx = { };
unsigned status = tgsi_parse_init(&ctx, c->tokens);
assert(status == TGSI_PARSE_OK);
int inst_idx = 0;
while (!tgsi_parse_end_of_tokens(&ctx)) {
const struct tgsi_full_instruction *inst = 0;
/* No inner temps used yet for this instruction, clear counter */
c->inner_temps = 0;
tgsi_parse_token(&ctx);
switch (ctx.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_INSTRUCTION:
/* iterate over operands */
inst = &ctx.FullToken.FullInstruction;
if (c->dead_inst[inst_idx]) { /* skip dead instructions */
inst_idx++;
continue;
}
/* Lookup the TGSI information and generate the source arguments */
struct etna_inst_src src[ETNA_NUM_SRC];
memset(src, 0, sizeof(src));
const struct tgsi_opcode_info *tgsi = tgsi_get_opcode_info(inst->Instruction.Opcode);
for (int i = 0; i < tgsi->num_src && i < ETNA_NUM_SRC; i++) {
const struct tgsi_full_src_register *reg = &inst->Src[i];
const struct etna_native_reg *n = &etna_get_src_reg(c, reg->Register)->native;
if (!n->valid || n->is_tex)
continue;
src[i] = etna_create_src(reg, n);
}
const unsigned opc = inst->Instruction.Opcode;
const struct instr_translater *t = &translaters[opc];
if (t->fxn) {
t->fxn(t, c, inst, src);
inst_idx += 1;
} else {
BUG("Unhandled instruction %s", tgsi_get_opcode_name(opc));
assert(0);
}
break;
}
}
tgsi_parse_free(&ctx);
}
/* Look up register by semantic */
static struct etna_reg_desc *
find_decl_by_semantic(struct etna_compile *c, uint file, uint name, uint index)
{
for (int idx = 0; idx < c->file[file].reg_size; ++idx) {
struct etna_reg_desc *reg = &c->file[file].reg[idx];
if (reg->semantic.Name == name && reg->semantic.Index == index)
return reg;
}
return NULL; /* not found */
}
/** Add ADD and MUL instructions to bring Z/W from -1..1 to 0..1, if needed:
* - this is a vertex shader
* - and this is an older GPU
*/
static void
etna_compile_add_z_div_if_needed(struct etna_compile *c)
{
if (c->info.processor == PIPE_SHADER_VERTEX && c->specs->vs_need_z_div) {
/* find position out */
struct etna_reg_desc *pos_reg =
find_decl_by_semantic(c, TGSI_FILE_OUTPUT, TGSI_SEMANTIC_POSITION, 0);
if (pos_reg != NULL) {
/*
* ADD tX.__z_, tX.zzzz, void, tX.wwww
* MUL tX.__z_, tX.zzzz, 0.5, void
*/
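/* Together these compute z' = (z + w) * 0.5, mapping z/w from the GL clip
* range [-1, 1] to the [0, 1] range these older GPUs expect. */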
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_ADD,
.dst = etna_native_to_dst(pos_reg->native, INST_COMPS_Z),
.src[0] = etna_native_to_src(pos_reg->native, SWIZZLE(Z, Z, Z, Z)),
.src[2] = etna_native_to_src(pos_reg->native, SWIZZLE(W, W, W, W)),
});
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_MUL,
.dst = etna_native_to_dst(pos_reg->native, INST_COMPS_Z),
.src[0] = etna_native_to_src(pos_reg->native, SWIZZLE(Z, Z, Z, Z)),
.src[1] = alloc_imm_f32(c, 0.5f),
});
}
}
}
static void
etna_compile_frag_rb_swap(struct etna_compile *c)
{
if (c->info.processor == PIPE_SHADER_FRAGMENT && c->key->frag_rb_swap) {
/* find color out */
struct etna_reg_desc *color_reg =
find_decl_by_semantic(c, TGSI_FILE_OUTPUT, TGSI_SEMANTIC_COLOR, 0);
emit_inst(c, &(struct etna_inst) {
.opcode = INST_OPCODE_MOV,
.dst = etna_native_to_dst(color_reg->native, INST_COMPS_X | INST_COMPS_Y | INST_COMPS_Z | INST_COMPS_W),
.src[2] = etna_native_to_src(color_reg->native, SWIZZLE(Z, Y, X, W)),
});
}
}
/** add a NOP to the shader if
* a) the shader is empty
* or
* b) there is a label at the end of the shader
*/
static void
etna_compile_add_nop_if_needed(struct etna_compile *c)
{
bool label_at_last_inst = false;
for (int idx = 0; idx < c->labels_count; ++idx) {
if (c->labels[idx].inst_idx == c->inst_ptr)
label_at_last_inst = true;
}
if (c->inst_ptr == 0 || label_at_last_inst)
emit_inst(c, &(struct etna_inst){.opcode = INST_OPCODE_NOP});
}
static void
assign_uniforms(struct etna_compile_file *file, unsigned base)
{
for (int idx = 0; idx < file->reg_size; ++idx) {
file->reg[idx].native.valid = 1;
file->reg[idx].native.rgroup = INST_RGROUP_UNIFORM_0;
file->reg[idx].native.id = base + idx;
}
}
/* Allocate CONST and IMM to native ETNA_RGROUP_UNIFORM(x).
* CONST must be consecutive, as const buffers are supposed to be consecutive,
* and must come before IMM; this is more convenient because the compilation
* process itself can generate extra immediates for constants such as pi, one
* and zero.
*/
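/* For example, with 10 TGSI constant registers, imm_base becomes 40
* (in 32-bit units) and the first immediate vec4 is assigned uniform
* register 10. */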
static void
assign_constants_and_immediates(struct etna_compile *c)
{
assign_uniforms(&c->file[TGSI_FILE_CONSTANT], 0);
/* immediates start after the constants */
c->imm_base = c->file[TGSI_FILE_CONSTANT].reg_size * 4;
assign_uniforms(&c->file[TGSI_FILE_IMMEDIATE], c->imm_base / 4);
DBG_F(ETNA_DBG_COMPILER_MSGS, "imm base: %i size: %i", c->imm_base,
c->imm_size);
}
/* Assign declared samplers to native texture units */
static void
assign_texture_units(struct etna_compile *c)
{
uint tex_base = 0;
if (c->info.processor == PIPE_SHADER_VERTEX)
tex_base = c->specs->vertex_sampler_offset;
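/* vertex samplers are mapped behind the fragment samplers in the global
* texture unit space (specs->vertex_sampler_offset; a +8 offset on these
* chips)
*/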
for (int idx = 0; idx < c->file[TGSI_FILE_SAMPLER].reg_size; ++idx) {
c->file[TGSI_FILE_SAMPLER].reg[idx].native.valid = 1;
c->file[TGSI_FILE_SAMPLER].reg[idx].native.is_tex = 1; // overrides rgroup
c->file[TGSI_FILE_SAMPLER].reg[idx].native.id = tex_base + idx;
}
}
/* Additional pass to fill in branch targets. This pass must run last, as no
* instructions may be reordered, removed or added once the branch targets
* have been computed.
*/
static void
etna_compile_fill_in_labels(struct etna_compile *c)
{
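/* c->lbl_usage[idx] holds the index of the label referenced by the branch
* instruction at idx, or -1 (set by the memset in etna_compile_shader) when
* instruction idx does not reference a label.
*/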
for (int idx = 0; idx < c->inst_ptr; ++idx) {
if (c->lbl_usage[idx] != -1)
etna_assemble_set_imm(&c->code[idx * 4],
c->labels[c->lbl_usage[idx]].inst_idx);
}
}
/* compare two etna_native_reg structures, return true if equal */
static bool
cmp_etna_native_reg(const struct etna_native_reg to,
const struct etna_native_reg from)
{
return to.valid == from.valid && to.is_tex == from.is_tex &&
to.rgroup == from.rgroup && to.id == from.id;
}
/* go through all declarations and swap native registers *to* and *from* */
static void
swap_native_registers(struct etna_compile *c, const struct etna_native_reg to,
const struct etna_native_reg from)
{
if (cmp_etna_native_reg(from, to))
return; /* Nothing to do */
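/* Swapping (rather than overwriting) keeps the register assignment a
* bijection: any declaration that already mapped to *to* ends up at *from*,
* so two declarations can never alias the same native register.
*/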
for (int idx = 0; idx < c->total_decls; ++idx) {
if (cmp_etna_native_reg(c->decl[idx].native, from)) {
c->decl[idx].native = to;
} else if (cmp_etna_native_reg(c->decl[idx].native, to)) {
c->decl[idx].native = from;
}
}
}
/* For PS we need to permute so that inputs are always in temporary 0..N-1.
* Semantic POS is always t0. If that semantic is not used, avoid t0.
*/
static void
permute_ps_inputs(struct etna_compile *c)
{
/* Special inputs:
* gl_FragCoord VARYING_SLOT_POS TGSI_SEMANTIC_POSITION
* gl_PointCoord VARYING_SLOT_PNTC TGSI_SEMANTIC_PCOORD
*/
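/* t0 is reserved for the position input, so the remaining active inputs are
* assigned consecutive temporaries starting at t1.
*/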
uint native_idx = 1;
for (int idx = 0; idx < c->file[TGSI_FILE_INPUT].reg_size; ++idx) {
struct etna_reg_desc *reg = &c->file[TGSI_FILE_INPUT].reg[idx];
uint input_id;
assert(reg->has_semantic);
if (!reg->active || reg->semantic.Name == TGSI_SEMANTIC_POSITION)
continue;
input_id = native_idx++;
swap_native_registers(c, etna_native_temp(input_id),
c->file[TGSI_FILE_INPUT].reg[idx].native);
}
c->num_varyings = native_idx - 1;
if (native_idx > c->next_free_native)
c->next_free_native = native_idx;
}
/* fill in ps inputs into shader object */
static void
fill_in_ps_inputs(struct etna_shader_variant *sobj, struct etna_compile *c)
{
struct etna_shader_io_file *sf = &sobj->infile;
sf->num_reg = 0;
for (int idx = 0; idx < c->file[TGSI_FILE_INPUT].reg_size; ++idx) {
struct etna_reg_desc *reg = &c->file[TGSI_FILE_INPUT].reg[idx];
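/* native id 0 is the reserved position input (t0), which is not a routed
* varying; only inputs assigned t1..tN are recorded here.
*/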
if (reg->native.id > 0) {
assert(sf->num_reg < ETNA_NUM_INPUTS);
sf->reg[sf->num_reg].reg = reg->native.id;
sf->reg[sf->num_reg].semantic = reg->semantic;
/* convert usage mask to number of components (*=wildcard)
* .r (0..1) -> 1 component
* .*g (2..3) -> 2 components
* .**b (4..7) -> 3 components
* .***a (8..15) -> 4 components
*/
sf->reg[sf->num_reg].num_components = util_last_bit(reg->usage_mask);
sf->num_reg++;
}
}
assert(sf->num_reg == c->num_varyings);
sobj->input_count_unk8 = 31; /* XXX what is this */
}
/* fill in output mapping for ps into shader object */
static void
fill_in_ps_outputs(struct etna_shader_variant *sobj, struct etna_compile *c)
{
sobj->outfile.num_reg = 0;
for (int idx = 0; idx < c->file[TGSI_FILE_OUTPUT].reg_size; ++idx) {
struct etna_reg_desc *reg = &c->file[TGSI_FILE_OUTPUT].reg[idx];
switch (reg->semantic.Name) {
case TGSI_SEMANTIC_COLOR: /* FRAG_RESULT_COLOR */
sobj->ps_color_out_reg = reg->native.id;
break;
case TGSI_SEMANTIC_POSITION: /* FRAG_RESULT_DEPTH */
sobj->ps_depth_out_reg = reg->native.id; /* =always native reg 0, only z component should be assigned */
break;
default:
assert(0); /* only outputs supported are COLOR and POSITION at the moment */
}
}
}
/* fill in inputs for vs into shader object */
static void
fill_in_vs_inputs(struct etna_shader_variant *sobj, struct etna_compile *c)
{
struct etna_shader_io_file *sf = &sobj->infile;
sf->num_reg = 0;
for (int idx = 0; idx < c->file[TGSI_FILE_INPUT].reg_size; ++idx) {
struct etna_reg_desc *reg = &c->file[TGSI_FILE_INPUT].reg[idx];
assert(sf->num_reg < ETNA_NUM_INPUTS);
if (!reg->native.valid)
continue;
/* XXX exclude inputs with special semantics such as gl_FrontFacing */
sf->reg[sf->num_reg].reg = reg->native.id;
sf->reg[sf->num_reg].semantic = reg->semantic;
sf->reg[sf->num_reg].num_components = util_last_bit(reg->usage_mask);
sf->num_reg++;
}
sobj->input_count_unk8 = (sf->num_reg + 19) / 16; /* XXX what is this */
}
/* build two-level output index [Semantic][Index] for fast linking */
static void
build_output_index(struct etna_shader_variant *sobj)
{
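/* Hypothetical example: with two TGSI_SEMANTIC_GENERIC outputs and one
* TGSI_SEMANTIC_FOG output, the flat list has three slots, and
* output_per_semantic[name] points at a contiguous sub-array sized by
* output_count_per_semantic[name]; a (name, index) lookup is then just two
* array dereferences (see etna_shader_vs_lookup).
*/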
int total = 0;
int offset = 0;
for (int name = 0; name < TGSI_SEMANTIC_COUNT; ++name)
total += sobj->output_count_per_semantic[name];
sobj->output_per_semantic_list = CALLOC(total, sizeof(struct etna_shader_inout *));
for (int name = 0; name < TGSI_SEMANTIC_COUNT; ++name) {
sobj->output_per_semantic[name] = &sobj->output_per_semantic_list[offset];
offset += sobj->output_count_per_semantic[name];
}
for (int idx = 0; idx < sobj->outfile.num_reg; ++idx) {
sobj->output_per_semantic[sobj->outfile.reg[idx].semantic.Name]
[sobj->outfile.reg[idx].semantic.Index] =
&sobj->outfile.reg[idx];
}
}
/* fill in outputs for vs into shader object */
static void
fill_in_vs_outputs(struct etna_shader_variant *sobj, struct etna_compile *c)
{
struct etna_shader_io_file *sf = &sobj->outfile;
sf->num_reg = 0;
for (int idx = 0; idx < c->file[TGSI_FILE_OUTPUT].reg_size; ++idx) {
struct etna_reg_desc *reg = &c->file[TGSI_FILE_OUTPUT].reg[idx];
assert(sf->num_reg < ETNA_NUM_INPUTS);
switch (reg->semantic.Name) {
case TGSI_SEMANTIC_POSITION:
sobj->vs_pos_out_reg = reg->native.id;
break;
case TGSI_SEMANTIC_PSIZE:
sobj->vs_pointsize_out_reg = reg->native.id;
break;
default:
sf->reg[sf->num_reg].reg = reg->native.id;
sf->reg[sf->num_reg].semantic = reg->semantic;
sf->reg[sf->num_reg].num_components = 4; // XXX reg->num_components;
sf->num_reg++;
sobj->output_count_per_semantic[reg->semantic.Name] =
MAX2(reg->semantic.Index + 1,
sobj->output_count_per_semantic[reg->semantic.Name]);
}
}
/* build two-level index for linking */
build_output_index(sobj);
/* Fill in the "mystery meat" load balancing value. This value determines
* how work is scheduled between VS and PS in the unified shader
* architecture. More precisely, it is derived from the number of VS
* outputs, as well as the chip-specific vertex output buffer size, vertex
* cache size, and number of shader cores.
*
* XXX this is a conservative estimate: the "optimal" value is only known
* for sure at link time, because some outputs may be unused and thus
* unmapped. Then again, in the common case with GLSL the vertex and
* fragment shaders are linked before being submitted to Gallium, so all
* outputs are used.
*/
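/* Hypothetical worked example (the spec values are illustrative, not taken
* from any particular chip): with 4 outputs, half_out = 2; if
* vertex_output_buffer_size = 512, vertex_cache_size = 16 and
* shader_core_count = 2, then
* b = (20480 / (512 - 2*2*16) + 9) / 10 = (45 + 9) / 10 = 5
* a = (5 + 256 / (2*2)) / 2 = (5 + 64) / 2 = 34
*/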
int half_out = (c->file[TGSI_FILE_OUTPUT].reg_size + 1) / 2;
assert(half_out);
uint32_t b = ((20480 / (c->specs->vertex_output_buffer_size -
2 * half_out * c->specs->vertex_cache_size)) +
9) /
10;
uint32_t a = (b + 256 / (c->specs->shader_core_count * half_out)) / 2;
sobj->vs_load_balancing = VIVS_VS_LOAD_BALANCING_A(MIN2(a, 255)) |
VIVS_VS_LOAD_BALANCING_B(MIN2(b, 255)) |
VIVS_VS_LOAD_BALANCING_C(0x3f) |
VIVS_VS_LOAD_BALANCING_D(0x0f);
}
static bool
etna_compile_check_limits(struct etna_compile *c)
{
int max_uniforms = (c->info.processor == PIPE_SHADER_VERTEX)
? c->specs->max_vs_uniforms
: c->specs->max_ps_uniforms;
/* round up number of uniforms, including immediates, in units of four */
int num_uniforms = c->imm_base / 4 + (c->imm_size + 3) / 4;
if (!c->specs->has_icache && c->inst_ptr > c->specs->max_instructions) {
DBG("Number of instructions (%d) exceeds maximum %d", c->inst_ptr,
c->specs->max_instructions);
return false;
}
if (c->next_free_native > c->specs->max_registers) {
DBG("Number of registers (%d) exceeds maximum %d", c->next_free_native,
c->specs->max_registers);
return false;
}
if (num_uniforms > max_uniforms) {
DBG("Number of uniforms (%d) exceeds maximum %d", num_uniforms,
max_uniforms);
return false;
}
if (c->num_varyings > c->specs->max_varyings) {
DBG("Number of varyings (%d) exceeds maximum %d", c->num_varyings,
c->specs->max_varyings);
return false;
}
if (c->imm_base > c->specs->num_constants) {
DBG("Number of constants (%d) exceeds maximum %d", c->imm_base,
c->specs->num_constants);
return false;
}
return true;
}
static void
copy_uniform_state_to_shader(struct etna_compile *c, struct etna_shader_variant *sobj)
{
uint32_t count = c->imm_size;
struct etna_shader_uniform_info *uinfo = &sobj->uniforms;
uinfo->const_count = c->imm_base;
uinfo->imm_count = count;
uinfo->imm_data = mem_dup(c->imm_data, count * sizeof(*c->imm_data));
uinfo->imm_contents = mem_dup(c->imm_contents, count * sizeof(*c->imm_contents));
etna_set_shader_uniforms_dirty_flags(sobj);
}
bool
etna_compile_shader(struct etna_shader_variant *v)
{
/* Create scratch space that may be too large to fit on the stack */
bool ret;
struct etna_compile *c;
if (unlikely(!v))
return false;
const struct etna_specs *specs = v->shader->specs;
struct tgsi_lowering_config lconfig = {
.lower_FLR = !specs->has_sign_floor_ceil,
.lower_CEIL = !specs->has_sign_floor_ceil,
.lower_POW = true,
.lower_EXP = true,
.lower_LOG = true,
.lower_DP2 = !specs->has_halti2_instructions,
.lower_TRUNC = true,
};
c = CALLOC_STRUCT(etna_compile);
if (!c)
return false;
memset(&c->lbl_usage, -1, sizeof(c->lbl_usage));
const struct tgsi_token *tokens = v->shader->tokens;
c->specs = specs;
c->key = &v->key;
c->tokens = tgsi_transform_lowering(&lconfig, tokens, &c->info);
c->free_tokens = !!c->tokens;
if (!c->tokens) {
/* no lowering */
c->tokens = tokens;
}
/* Build a map from gallium register to native registers for files
* CONST, SAMP, IMM, OUT, IN, TEMP.
* SAMP will map as-is for fragment shaders; for vertex shaders there will
* be a +8 offset (see assign_texture_units).
*/
/* Pass one -- check register file declarations and immediates */
etna_compile_parse_declarations(c);
etna_allocate_decls(c);
/* Pass two -- check usage of temporaries, inputs, outputs */
etna_compile_pass_check_usage(c);
assign_special_inputs(c);
/* Assign native temp register to TEMPs */
assign_temporaries_to_native(c, &c->file[TGSI_FILE_TEMPORARY]);
/* optimize outputs */
etna_compile_pass_optimize_outputs(c);
/* XXX assign special inputs: gl_FrontFacing (VARYING_SLOT_FACE)
* this is part of RGROUP_INTERNAL
*/
/* assign inputs: last usage of input should be <= first usage of temp */
/* potential optimization case:
* if there is a single MOV TEMP[y], IN[x], before which temp y is not used
* and after which IN[x] is not read, temp[y] can be used as the input
* register as-is
*/
/* algorithm:
* sort temporaries by first use
* sort inputs by last usage
* iterate over inputs, temporaries
* if last usage of input <= first usage of temp:
* assign input to temp
* advance input and temporary pointer
* else
* advance temporary pointer
*
* potential problem: an instruction with multiple source operands, of which
* one is the temp and the other is the input; however, as the temp is not
* used before this, how would that make sense? Uninitialized temporaries
* have an undefined value, so this would be ok.
*/
assign_inouts_to_temporaries(c, TGSI_FILE_INPUT);
/* assign outputs: first usage of output should be >= last usage of temp */
/* potential optimization case:
* if there is a single MOV OUT[x], TEMP[y] (with a full write mask, or at
* least writing all components that are used in the shader), after which
* temp y is no longer used, temp[y] can be used as the output register
* as-is
*
* potential problem: an instruction with multiple destinations, of which
* one is the temp and the other is the output; however, as the temp is not
* used after this, how would that make sense? The output value could simply
* be discarded.
*/
/* algorithm:
* sort temporaries by last use
* sort outputs by first usage
* iterate over outputs, temporaries
* if first usage of output >= last usage of temp:
* assign output to temp
* advance output and temporary pointer
* else
* advance temporary pointer
*/
assign_inouts_to_temporaries(c, TGSI_FILE_OUTPUT);
assign_constants_and_immediates(c);
assign_texture_units(c);
/* list declarations */
for (int x = 0; x < c->total_decls; ++x) {
DBG_F(ETNA_DBG_COMPILER_MSGS, "%i: %s,%d active=%i first_use=%i "
"last_use=%i native=%i usage_mask=%x "
"has_semantic=%i",
x, tgsi_file_name(c->decl[x].file), c->decl[x].idx,
c->decl[x].active, c->decl[x].first_use, c->decl[x].last_use,
c->decl[x].native.valid ? c->decl[x].native.id : -1,
c->decl[x].usage_mask, c->decl[x].has_semantic);
if (c->decl[x].has_semantic)
DBG_F(ETNA_DBG_COMPILER_MSGS, " semantic_name=%s semantic_idx=%i",
tgsi_semantic_names[c->decl[x].semantic.Name],
c->decl[x].semantic.Index);
}
/* XXX for PS we need to permute so that inputs are always in temporaries
* 0..N-1. There is no "switchboard" for varyings (AFAIK!). The output
* color, however, can be routed from an arbitrary temporary.
*/
if (c->info.processor == PIPE_SHADER_FRAGMENT)
permute_ps_inputs(c);
/* list declarations again, after PS input permutation */
for (int x = 0; x < c->total_decls; ++x) {
DBG_F(ETNA_DBG_COMPILER_MSGS, "%i: %s,%d active=%i first_use=%i "
"last_use=%i native=%i usage_mask=%x "
"has_semantic=%i",
x, tgsi_file_name(c->decl[x].file), c->decl[x].idx,
c->decl[x].active, c->decl[x].first_use, c->decl[x].last_use,
c->decl[x].native.valid ? c->decl[x].native.id : -1,
c->decl[x].usage_mask, c->decl[x].has_semantic);
if (c->decl[x].has_semantic)
DBG_F(ETNA_DBG_COMPILER_MSGS, " semantic_name=%s semantic_idx=%i",
tgsi_semantic_names[c->decl[x].semantic.Name],
c->decl[x].semantic.Index);
}
/* pass 3: generate instructions */
etna_compile_pass_generate_code(c);
etna_compile_add_z_div_if_needed(c);
etna_compile_frag_rb_swap(c);
etna_compile_add_nop_if_needed(c);
ret = etna_compile_check_limits(c);
if (!ret)
goto out;
etna_compile_fill_in_labels(c);
/* fill in output structure */
v->processor = c->info.processor;
v->code_size = c->inst_ptr * 4;
v->code = mem_dup(c->code, c->inst_ptr * 16);
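/* code_size counts 32-bit words (4 per instruction), while mem_dup takes a
* size in bytes (16 per instruction) */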
v->num_loops = c->num_loops;
v->num_temps = c->next_free_native;
v->vs_pos_out_reg = -1;
v->vs_pointsize_out_reg = -1;
v->ps_color_out_reg = -1;
v->ps_depth_out_reg = -1;
v->needs_icache = c->inst_ptr > c->specs->max_instructions;
copy_uniform_state_to_shader(c, v);
if (c->info.processor == PIPE_SHADER_VERTEX) {
fill_in_vs_inputs(v, c);
fill_in_vs_outputs(v, c);
} else if (c->info.processor == PIPE_SHADER_FRAGMENT) {
fill_in_ps_inputs(v, c);
fill_in_ps_outputs(v, c);
}
out:
if (c->free_tokens)
FREE((void *)c->tokens);
FREE(c->labels);
FREE(c);
return ret;
}
extern const char *tgsi_swizzle_names[];
void
etna_dump_shader(const struct etna_shader_variant *shader)
{
if (shader->processor == PIPE_SHADER_VERTEX)
printf("VERT\n");
else
printf("FRAG\n");
etna_disasm(shader->code, shader->code_size, PRINT_RAW);
printf("num loops: %i\n", shader->num_loops);
printf("num temps: %i\n", shader->num_temps);
printf("num const: %i\n", shader->uniforms.const_count);
printf("immediates:\n");
for (int idx = 0; idx < shader->uniforms.imm_count; ++idx) {
printf(" [%i].%s = %f (0x%08x)\n",
(idx + shader->uniforms.const_count) / 4,
tgsi_swizzle_names[idx % 4],
*((float *)&shader->uniforms.imm_data[idx]),
shader->uniforms.imm_data[idx]);
}
printf("inputs:\n");
for (int idx = 0; idx < shader->infile.num_reg; ++idx) {
printf(" [%i] name=%s index=%i comps=%i\n", shader->infile.reg[idx].reg,
tgsi_semantic_names[shader->infile.reg[idx].semantic.Name],
shader->infile.reg[idx].semantic.Index,
shader->infile.reg[idx].num_components);
}
printf("outputs:\n");
for (int idx = 0; idx < shader->outfile.num_reg; ++idx) {
printf(" [%i] name=%s index=%i comps=%i\n", shader->outfile.reg[idx].reg,
tgsi_semantic_names[shader->outfile.reg[idx].semantic.Name],
shader->outfile.reg[idx].semantic.Index,
shader->outfile.reg[idx].num_components);
}
printf("special:\n");
if (shader->processor == PIPE_SHADER_VERTEX) {
printf(" vs_pos_out_reg=%i\n", shader->vs_pos_out_reg);
printf(" vs_pointsize_out_reg=%i\n", shader->vs_pointsize_out_reg);
printf(" vs_load_balancing=0x%08x\n", shader->vs_load_balancing);
} else {
printf(" ps_color_out_reg=%i\n", shader->ps_color_out_reg);
printf(" ps_depth_out_reg=%i\n", shader->ps_depth_out_reg);
}
printf(" input_count_unk8=0x%08x\n", shader->input_count_unk8);
}
void
etna_destroy_shader(struct etna_shader_variant *shader)
{
assert(shader);
FREE(shader->code);
FREE(shader->uniforms.imm_data);
FREE(shader->uniforms.imm_contents);
FREE(shader->output_per_semantic_list);
FREE(shader);
}
static const struct etna_shader_inout *
etna_shader_vs_lookup(const struct etna_shader_variant *sobj,
const struct etna_shader_inout *in)
{
if (in->semantic.Index < sobj->output_count_per_semantic[in->semantic.Name])
return sobj->output_per_semantic[in->semantic.Name][in->semantic.Index];
return NULL;
}
bool
etna_link_shader(struct etna_shader_link_info *info,
const struct etna_shader_variant *vs, const struct etna_shader_variant *fs)
{
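/* note: returns true on link failure, false on success */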
int comp_ofs = 0;
/* For each fragment input we need to find the associated vertex shader
* output, which can be found by matching on semantic name and index. A
* binary search could be used because the vs outputs are sorted by their
* semantic index and grouped by semantic type by fill_in_vs_outputs.
*/
assert(fs->infile.num_reg < ETNA_NUM_INPUTS);
info->pcoord_varying_comp_ofs = -1;
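/* comp_ofs tracks the running component offset of each varying within the
* packed varying space; it is recorded for PCOORD so the point coordinate
* components can be located later.
*/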
for (int idx = 0; idx < fs->infile.num_reg; ++idx) {
const struct etna_shader_inout *fsio = &fs->infile.reg[idx];
const struct etna_shader_inout *vsio = etna_shader_vs_lookup(vs, fsio);
struct etna_varying *varying;
bool interpolate_always = fsio->semantic.Name != TGSI_SEMANTIC_COLOR;
assert(fsio->reg > 0 && fsio->reg <= ARRAY_SIZE(info->varyings));
if (fsio->reg > info->num_varyings)
info->num_varyings = fsio->reg;
varying = &info->varyings[fsio->reg - 1];
varying->num_components = fsio->num_components;
if (!interpolate_always) /* colors affected by flat shading */
varying->pa_attributes = 0x200;
else /* texture coord or other bypasses flat shading */
varying->pa_attributes = 0x2f1;
varying->use[0] = VARYING_COMPONENT_USE_USED;
varying->use[1] = VARYING_COMPONENT_USE_USED;
varying->use[2] = VARYING_COMPONENT_USE_USED;
varying->use[3] = VARYING_COMPONENT_USE_USED;
/* point coord is an input to the PS without matching VS output,
* so it gets a varying slot without being assigned a VS register.
*/
if (fsio->semantic.Name == TGSI_SEMANTIC_PCOORD) {
varying->use[0] = VARYING_COMPONENT_USE_POINTCOORD_X;
varying->use[1] = VARYING_COMPONENT_USE_POINTCOORD_Y;
info->pcoord_varying_comp_ofs = comp_ofs;
} else {
if (vsio == NULL) { /* not found -- link error */
BUG("Semantic %d value %d not found in vertex shader outputs\n", fsio->semantic.Name, fsio->semantic.Index);
return true;
}
varying->reg = vsio->reg;
}
comp_ofs += varying->num_components;
}
assert(info->num_varyings == fs->infile.num_reg);
return false;
}