/*
* Copyright © 2011 Intel Corporation
* Copyright © 2022 Valve Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* This lowering pass generates NIR that manually packs varyings into
* vec4 slots, for the benefit of back-ends that don't support packed varyings
* natively.
*
* For example, the following shader:
*
* out mat3x2 foo; // location=4, location_frac=0
* out vec3 bar[2]; // location=5, location_frac=2
*
* main()
* {
* ...
* }
*
* Is rewritten to:
*
* mat3x2 foo;
* vec3 bar[2];
* out vec4 packed4; // location=4, location_frac=0
* out vec4 packed5; // location=5, location_frac=0
* out vec4 packed6; // location=6, location_frac=0
*
* main()
* {
* ...
* packed4.xy = foo[0];
* packed4.zw = foo[1];
* packed5.xy = foo[2];
* packed5.zw = bar[0].xy;
* packed6.x = bar[0].z;
* packed6.yzw = bar[1];
* }
*
* This lowering pass properly handles "double parking" of a varying vector
* across two varying slots. For example, in the code above, two of the
* components of bar[0] are stored in packed5, and the remaining component is
* stored in packed6.
*
* Note that in theory, the extra instructions may cause some loss of
* performance. However, hopefully in most cases the performance loss will
* either be absorbed by a later optimization pass, or it will be offset by
* memory bandwidth savings (because fewer varyings are used).
*
* This lowering pass also packs flat floats, ints, and uints together, by
* using ivec4 as the base type of flat "varyings", and using appropriate
* casts to convert floats and uints into ints.
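*
* For example (a hypothetical shader, for illustration only):
*
* flat out float f; // location=4, location_frac=0
* flat out int i; // location=4, location_frac=1
*
* is packed into a single "flat out ivec4 packed4"; the float is moved into
* packed4.x with its bit pattern unchanged, and the int is stored directly
* into packed4.y.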
*
* This lowering pass also handles varyings whose type is a struct or an array
* of struct. Structs are packed in order and with no gaps, so there may be a
* performance penalty due to structure elements being double-parked.
*
* Lowering of geometry shader inputs is slightly more complex, since geometry
* inputs are always arrays, so we need to lower arrays to arrays. For
* example, the following input:
*
* in struct Foo {
* float f;
* vec3 v;
* vec2 a[2];
* } arr[3]; // location=4, location_frac=0
*
* Would get lowered like this if it occurred in a fragment shader:
*
* struct Foo {
* float f;
* vec3 v;
* vec2 a[2];
* } arr[3];
* in vec4 packed4; // location=4, location_frac=0
* in vec4 packed5; // location=5, location_frac=0
* in vec4 packed6; // location=6, location_frac=0
* in vec4 packed7; // location=7, location_frac=0
* in vec4 packed8; // location=8, location_frac=0
* in vec4 packed9; // location=9, location_frac=0
*
* main()
* {
* arr[0].f = packed4.x;
* arr[0].v = packed4.yzw;
* arr[0].a[0] = packed5.xy;
* arr[0].a[1] = packed5.zw;
* arr[1].f = packed6.x;
* arr[1].v = packed6.yzw;
* arr[1].a[0] = packed7.xy;
* arr[1].a[1] = packed7.zw;
* arr[2].f = packed8.x;
* arr[2].v = packed8.yzw;
* arr[2].a[0] = packed9.xy;
* arr[2].a[1] = packed9.zw;
* ...
* }
*
* But it would get lowered like this if it occurred in a geometry shader:
*
* struct Foo {
* float f;
* vec3 v;
* vec2 a[2];
* } arr[3];
* in vec4 packed4[3]; // location=4, location_frac=0
* in vec4 packed5[3]; // location=5, location_frac=0
*
* main()
* {
* arr[0].f = packed4[0].x;
* arr[0].v = packed4[0].yzw;
* arr[0].a[0] = packed5[0].xy;
* arr[0].a[1] = packed5[0].zw;
* arr[1].f = packed4[1].x;
* arr[1].v = packed4[1].yzw;
* arr[1].a[0] = packed5[1].xy;
* arr[1].a[1] = packed5[1].zw;
* arr[2].f = packed4[2].x;
* arr[2].v = packed4[2].yzw;
* arr[2].a[0] = packed5[2].xy;
* arr[2].a[1] = packed5[2].zw;
* ...
* }
*/
#include "nir.h"
#include "nir_builder.h"
#include "gl_nir.h"
#include "gl_nir_linker.h"
#include "program/prog_instruction.h"
#include "main/mtypes.h"
/**
* State for the pass that performs varying packing. For each varying declared
* in the shader, the pass determines whether it needs to be packed. If so, it
* demotes it to an ordinary global, creates new packed varyings, and
* generates assignments to convert between the original varying and the
* packed varying.
*/
struct lower_packed_varyings_state
{
const struct gl_constants *consts;
struct gl_shader_program *prog;
/**
* Memory context used for temporary allocations made by this pass.
*/
void *mem_ctx;
/**
* Number of generic varying slots which are used by this shader. This is
* used to allocate temporary intermediate data structures. If any varying
* used by this shader has a location greater than or equal to
* VARYING_SLOT_VAR0 + locations_used, an assertion will fire.
*/
unsigned locations_used;
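/**
* Number of components used in each generic varying slot, indexed by
* (location - VARYING_SLOT_VAR0). Provided by the caller and used to size
* the packed varying created for each slot.
*/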
const uint8_t* components;
/**
* Array of pointers to the packed varyings that have been created for each
* generic varying slot. NULL entries in this array indicate varying slots
* for which a packed varying has not been created yet.
*/
nir_variable **packed_varyings;
nir_shader *shader;
nir_function_impl *impl;
nir_builder b;
/**
* Type of varying which is being lowered in this pass (either
* nir_var_shader_in or nir_var_shader_out).
*/
nir_variable_mode mode;
/**
* If we are currently lowering geometry shader inputs, the number of input
* vertices the geometry shader accepts. Otherwise zero.
*/
unsigned gs_input_vertices;
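/**
* Packing options passed through from gl_nir_lower_packed_varyings(), plus
* whether this shader's interface is exposed to the program resource query
* API (used to decide when packed varyings must be added to the resource
* list).
*/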
bool disable_varying_packing;
bool disable_xfb_packing;
bool xfb_enabled;
bool ifc_exposed_to_query_api;
};
bool
lower_packed_varying_needs_lowering(nir_shader *shader, nir_variable *var,
bool xfb_enabled, bool disable_xfb_packing,
bool disable_varying_packing)
{
/* Things composed of full vec4s, varyings with explicitly assigned
* locations, and varyings marked as must_be_shader_input (which might be
* used by interpolateAt* functions) shouldn't be lowered. Everything else
* can be.
*/
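/* For example (hypothetical declarations), "out vec2 uv;" or "flat out int
* id;" without an explicit location would be lowered, while "out vec4 color;"
* already fills a full slot and is left alone.
*/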
if (var->data.explicit_location || var->data.must_be_shader_input)
return false;
const struct glsl_type *type = var->type;
if (nir_is_arrayed_io(var, shader->info.stage)) {
assert(glsl_type_is_array(type));
type = glsl_get_array_element(type);
}
/* Some drivers (e.g. panfrost) don't support packing of transform
* feedback varyings.
*/
if (disable_xfb_packing && var->data.is_xfb &&
!(glsl_type_is_array(type) || glsl_type_is_struct(type) || glsl_type_is_matrix(type)) &&
xfb_enabled)
return false;
/* Override disable_varying_packing if the var is only used by transform
* feedback. Also override it if transform feedback is enabled and the
* variable is an array, struct or matrix as the elements of these types
* will always have the same interpolation and therefore are safe to pack.
*/
if (disable_varying_packing && !var->data.is_xfb_only &&
!((glsl_type_is_array(type) || glsl_type_is_struct(type) || glsl_type_is_matrix(type)) &&
xfb_enabled))
return false;
type = glsl_without_array(type);
if (glsl_get_vector_elements(type) == 4 && !glsl_type_is_64bit(type))
return false;
return true;
}
/**
* If no packed varying has been created for the given varying location yet,
* create it and add it to the shader. Otherwise, update the existing packed
* varying's name and always_active_io flag to account for the newly packed
* variable.
*
* The newly created varying inherits its interpolation parameters from \c
* unpacked_var. Its base type is ivec4 if we are lowering a flat varying,
* vec4 otherwise.
*/
static void
create_or_update_packed_varying(struct lower_packed_varyings_state *state,
nir_variable *unpacked_var,
const char *name, unsigned location,
unsigned slot, unsigned vertex_index)
{
assert(slot < state->locations_used);
if (state->packed_varyings[slot] == NULL) {
assert(state->components[slot] != 0);
assert(name);
nir_variable *packed_var = rzalloc(state->shader, nir_variable);
packed_var->name = ralloc_asprintf(packed_var, "packed:%s", name);
packed_var->data.mode = state->mode;
bool is_interpolation_flat =
unpacked_var->data.interpolation == INTERP_MODE_FLAT ||
glsl_contains_integer(unpacked_var->type) ||
glsl_contains_double(unpacked_var->type);
const struct glsl_type *packed_type;
if (is_interpolation_flat)
packed_type = glsl_vector_type(GLSL_TYPE_INT, state->components[slot]);
else
packed_type = glsl_vector_type(GLSL_TYPE_FLOAT, state->components[slot]);
if (state->gs_input_vertices != 0) {
packed_type =
glsl_array_type(packed_type, state->gs_input_vertices, 0);
}
packed_var->type = packed_type;
packed_var->data.centroid = unpacked_var->data.centroid;
packed_var->data.sample = unpacked_var->data.sample;
packed_var->data.patch = unpacked_var->data.patch;
packed_var->data.interpolation = is_interpolation_flat ?
(unsigned) INTERP_MODE_FLAT : unpacked_var->data.interpolation;
packed_var->data.location = location;
packed_var->data.precision = unpacked_var->data.precision;
packed_var->data.always_active_io = unpacked_var->data.always_active_io;
packed_var->data.stream = NIR_STREAM_PACKED;
nir_shader_add_variable(state->shader, packed_var);
state->packed_varyings[slot] = packed_var;
} else {
nir_variable *var = state->packed_varyings[slot];
/* The slot needs to be marked as always active if any variable that got
* packed there was.
*/
var->data.always_active_io |= unpacked_var->data.always_active_io;
/* For geometry shader inputs, only append to the packed variable's name
* while lowering the first vertex, so each unpacked name is added once.
*/
if (state->gs_input_vertices == 0 || vertex_index == 0) {
assert(name);
ralloc_asprintf_append((char **) &var->name, ",%s", name);
}
}
}
/**
* Retrieve the packed varying corresponding to the given varying location.
*
* \param vertex_index: if we are lowering geometry shader inputs, then this
* indicates which vertex we are currently lowering. Otherwise it is ignored.
*/
static nir_deref_instr *
get_packed_varying_deref(struct lower_packed_varyings_state *state,
unsigned location, nir_variable *unpacked_var,
const char *name, unsigned vertex_index)
{
unsigned slot = location - VARYING_SLOT_VAR0;
assert(slot < state->locations_used);
create_or_update_packed_varying(state, unpacked_var, name, location, slot,
vertex_index);
nir_deref_instr *deref =
nir_build_deref_var(&state->b, state->packed_varyings[slot]);
if (state->gs_input_vertices != 0) {
/* When lowering GS inputs, the packed variable is an array, so we need
* to dereference it using vertex_index.
*/
nir_load_const_instr *c_idx =
nir_load_const_instr_create(state->b.shader, 1, 32);
c_idx->value[0].u32 = vertex_index;
nir_builder_instr_insert(&state->b, &c_idx->instr);
deref = nir_build_deref_array(&state->b, deref, &c_idx->def);
}
return deref;
}
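/**
* Store(s) produced by bitwise_assign_pack()/bitwise_assign_unpack() for a
* single (un)packing assignment. When a two-component 64-bit value has to be
* split into two 32-bit writes, is_64bit is set and both entries of values[]
* and writemasks[] are used; otherwise only index 0 is used.
*/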
struct packing_store_values {
bool is_64bit;
unsigned writemasks[2];
nir_def *values[2];
nir_deref_instr *deref;
};
/**
* Build the store value(s) needed to assign \c value to \c packed_deref,
* performing appropriate bitcasts if necessary to match up types.
*
* This function is called when packing varyings.
*/
static struct packing_store_values *
bitwise_assign_pack(struct lower_packed_varyings_state *state,
nir_deref_instr *packed_deref,
nir_deref_instr *unpacked_deref,
const struct glsl_type *unpacked_type,
nir_def *value,
unsigned writemask)
{
nir_variable *packed_var = nir_deref_instr_get_variable(packed_deref);
enum glsl_base_type packed_base_type = glsl_get_base_type(packed_var->type);
enum glsl_base_type unpacked_base_type = glsl_get_base_type(unpacked_type);
struct packing_store_values *store_state =
calloc(1, sizeof(struct packing_store_values));
if (unpacked_base_type != packed_base_type) {
/* Since we only mix types in flat varyings, and we always store flat
* varyings as type ivec4, we need only produce conversions from (uint
* or float) to int.
*/
assert(packed_base_type == GLSL_TYPE_INT);
switch (unpacked_base_type) {
case GLSL_TYPE_UINT:
case GLSL_TYPE_FLOAT:
value = nir_mov(&state->b, value);
break;
case GLSL_TYPE_DOUBLE:
case GLSL_TYPE_UINT64:
case GLSL_TYPE_INT64:
assert(glsl_get_vector_elements(unpacked_type) <= 2);
if (glsl_get_vector_elements(unpacked_type) == 2) {
assert(glsl_get_vector_elements(packed_var->type) == 4);
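/* A two-component 64-bit value fills a whole 32-bit vec4 slot: component
* .x is unpacked into .xy of the packed vec4 and .y into .zw, emitted as
* two separate 32-bit store values below.
*/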
unsigned swiz_x = 0;
unsigned writemask = 0x3;
nir_def *swizzle = nir_swizzle(&state->b, value, &swiz_x, 1);
store_state->is_64bit = true;
store_state->deref = packed_deref;
store_state->values[0] = nir_unpack_64_2x32(&state->b, swizzle);
store_state->writemasks[0] = writemask;
unsigned swiz_y = 1;
writemask = 0xc;
swizzle = nir_swizzle(&state->b, value, &swiz_y, 1);
store_state->deref = packed_deref;
store_state->values[1] = nir_unpack_64_2x32(&state->b, swizzle);
store_state->writemasks[1] = writemask;
return store_state;
} else {
value = nir_unpack_64_2x32(&state->b, value);
}
break;
case GLSL_TYPE_SAMPLER:
case GLSL_TYPE_IMAGE:
value = nir_unpack_64_2x32(&state->b, value);
break;
default:
assert(!"Unexpected type conversion while lowering varyings");
break;
}
}
store_state->deref = packed_deref;
store_state->values[0] = value;
store_state->writemasks[0] = writemask;
return store_state;
}
/**
* The inverse of bitwise_assign_pack(): build the store value(s) needed to
* assign the loaded packed \c value to \c unpacked_deref, converting back
* from int as necessary. This function is called when unpacking varyings.
*/
static struct packing_store_values *
bitwise_assign_unpack(struct lower_packed_varyings_state *state,
nir_deref_instr *unpacked_deref,
nir_deref_instr *packed_deref,
const struct glsl_type *unpacked_type,
nir_def *value, unsigned writemask)
{
nir_variable *packed_var = nir_deref_instr_get_variable(packed_deref);
const struct glsl_type *packed_type = glsl_without_array(packed_var->type);
enum glsl_base_type packed_base_type = glsl_get_base_type(packed_type);
enum glsl_base_type unpacked_base_type = glsl_get_base_type(unpacked_type);
struct packing_store_values *store_state =
calloc(1, sizeof(struct packing_store_values));
if (unpacked_base_type != packed_base_type) {
/* Since we only mix types in flat varyings, and we always store flat
* varyings as type ivec4, we need only produce conversions from int to
* (uint or float).
*/
assert(packed_base_type == GLSL_TYPE_INT);
switch (unpacked_base_type) {
case GLSL_TYPE_UINT:
case GLSL_TYPE_FLOAT:
value = nir_mov(&state->b, value);
break;
case GLSL_TYPE_DOUBLE:
case GLSL_TYPE_UINT64:
case GLSL_TYPE_INT64:
assert(glsl_get_vector_elements(unpacked_type) <= 2);
if (glsl_get_vector_elements(unpacked_type) == 2) {
assert(glsl_get_vector_elements(packed_type) == 4);
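/* The reverse of the pack case: .xy of the packed vec4 are re-packed into
* the destination's first written 64-bit component and .zw into the next,
* again as two separate store values.
*/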
unsigned swiz_xy[2] = {0, 1};
writemask = 1 << (ffs(writemask) - 1);
store_state->is_64bit = true;
store_state->deref = unpacked_deref;
store_state->values[0] =
nir_pack_64_2x32(&state->b,
nir_swizzle(&state->b, value, swiz_xy, 2));
store_state->writemasks[0] = writemask;
unsigned swiz_zw[2] = {2, 3};
writemask = writemask << 1;
store_state->deref = unpacked_deref;
store_state->values[1] =
nir_pack_64_2x32(&state->b,
nir_swizzle(&state->b, value, swiz_zw, 2));
store_state->writemasks[1] = writemask;
return store_state;
} else {
value = nir_pack_64_2x32(&state->b, value);
}
break;
case GLSL_TYPE_SAMPLER:
case GLSL_TYPE_IMAGE:
value = nir_pack_64_2x32(&state->b, value);
break;
default:
assert(!"Unexpected type conversion while lowering varyings");
break;
}
}
store_state->deref = unpacked_deref;
store_state->values[0] = value;
store_state->writemasks[0] = writemask;
return store_state;
}
static void
create_store_deref(struct lower_packed_varyings_state *state,
nir_deref_instr *deref, nir_def *value,
unsigned writemask, bool is_64bit)
{
/* If the destination and value have different numbers of components, pack
* the sources into a vector.
*/
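/* For example, a two-component value stored with writemask 0xc into a vec4
* destination becomes vec4(undef, undef, value.x, value.y) before the store.
*/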
const struct glsl_type *type = glsl_without_array(deref->type);
unsigned comps = glsl_get_vector_elements(type);
if (value->num_components != comps) {
nir_def *srcs[4];
unsigned comp = 0;
for (unsigned i = 0; i < comps; i++) {
if (writemask & (1 << i)) {
if (is_64bit && state->mode == nir_var_shader_in)
srcs[i] = value;
else
srcs[i] = nir_swizzle(&state->b, value, &comp, 1);
comp++;
} else {
srcs[i] = nir_undef(&state->b, 1,
glsl_type_is_64bit(type) ? 64 : 32);
}
}
value = nir_vec(&state->b, srcs, comps);
}
nir_store_deref(&state->b, deref, value, writemask);
}
static unsigned
lower_varying(struct lower_packed_varyings_state *state,
nir_def *rhs_swizzle, unsigned writemask,
const struct glsl_type *type, unsigned fine_location,
nir_variable *unpacked_var, nir_deref_instr *unpacked_var_deref,
const char *name, bool gs_input_toplevel, unsigned vertex_index);
/**
* Recursively pack or unpack a varying for which we need to iterate over its
* constituent elements.
* This takes care of both arrays and matrices.
*
* \param gs_input_toplevel should be set to true if we are lowering geometry
* shader inputs, and we are currently lowering the whole input variable
* (i.e. we are lowering the array whose index selects the vertex).
*
* \param vertex_index: if we are lowering geometry shader inputs, and the
* level of the array that we are currently lowering is *not* the top level,
* then this indicates which vertex we are currently lowering. Otherwise it
* is ignored.
*/
static unsigned
lower_arraylike(struct lower_packed_varyings_state *state,
nir_def *rhs_swizzle, unsigned writemask,
const struct glsl_type *type, unsigned fine_location,
nir_variable *unpacked_var, nir_deref_instr *unpacked_var_deref,
const char *name, bool gs_input_toplevel, unsigned vertex_index)
{
unsigned array_size = glsl_get_length(type);
unsigned dmul = glsl_type_is_64bit(glsl_without_array(type)) ? 2 : 1;
if (array_size * dmul + fine_location % 4 > 4) {
fine_location = ALIGN_POT(fine_location, dmul);
}
type = glsl_get_array_element(type);
for (unsigned i = 0; i < array_size; i++) {
nir_load_const_instr *c_idx =
nir_load_const_instr_create(state->b.shader, 1, 32);
c_idx->value[0].u32 = i;
nir_builder_instr_insert(&state->b, &c_idx->instr);
nir_deref_instr *unpacked_array_deref =
nir_build_deref_array(&state->b, unpacked_var_deref, &c_idx->def);
if (gs_input_toplevel) {
/* Geometry shader inputs are a special case. Instead of storing
* each element of the array at a different location, all elements
* are at the same location, but with a different vertex index.
*/
(void) lower_varying(state, rhs_swizzle, writemask, type, fine_location,
unpacked_var, unpacked_array_deref, name, false, i);
} else {
char *subscripted_name = name ?
ralloc_asprintf(state->mem_ctx, "%s[%d]", name, i) : NULL;
fine_location =
lower_varying(state, rhs_swizzle, writemask, type, fine_location,
unpacked_var, unpacked_array_deref,
subscripted_name, false, vertex_index);
}
}
return fine_location;
}
/**
* Recursively pack or unpack the given varying (or portion of a varying) by
* traversing all of its constituent vectors.
*
* \param fine_location is the location where the first constituent vector
* should be packed--the word "fine" indicates that this location is expressed
* in multiples of a float, rather than multiples of a vec4 as is used
* elsewhere in Mesa.
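* For example, a varying at data.location L with data.location_frac 2 has
* fine_location 4 * L + 2 and starts at component .z of slot L.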
*
* \param gs_input_toplevel should be set to true if we are lowering geometry
* shader inputs, and we are currently lowering the whole input variable
* (i.e. we are lowering the array whose index selects the vertex).
*
* \param vertex_index: if we are lowering geometry shader inputs, and the
* level of the array that we are currently lowering is *not* the top level,
* then this indicates which vertex we are currently lowering. Otherwise it
* is ignored.
*
* \return the location where the next constituent vector (after this one)
* should be packed.
*/
static unsigned
lower_varying(struct lower_packed_varyings_state *state,
nir_def *rhs_swizzle, unsigned writemask,
const struct glsl_type *type, unsigned fine_location,
nir_variable *unpacked_var, nir_deref_instr *unpacked_var_deref,
const char *name, bool gs_input_toplevel, unsigned vertex_index)
{
unsigned dmul = glsl_type_is_64bit(type) ? 2 : 1;
/* When gs_input_toplevel is set, we should be looking at a geometry shader
* input array.
*/
assert(!gs_input_toplevel || glsl_type_is_array(type));
if (glsl_type_is_struct(type)) {
unsigned struct_len = glsl_get_length(type);
for (unsigned i = 0; i < struct_len; i++) {
const char *field_name = glsl_get_struct_elem_name(type, i);
char *deref_name = name ?
ralloc_asprintf(state->mem_ctx, "%s.%s", name, field_name) :
NULL;
const struct glsl_type *field_type = glsl_get_struct_field(type, i);
nir_deref_instr *unpacked_struct_deref =
nir_build_deref_struct(&state->b, unpacked_var_deref, i);
fine_location = lower_varying(state, rhs_swizzle, writemask, field_type,
fine_location, unpacked_var,
unpacked_struct_deref, deref_name,
false, vertex_index);
}
return fine_location;
} else if (glsl_type_is_array(type)) {
/* Arrays are packed/unpacked by considering each array element in
* sequence.
*/
return lower_arraylike(state, rhs_swizzle, writemask, type, fine_location,
unpacked_var, unpacked_var_deref, name,
gs_input_toplevel, vertex_index);
} else if (glsl_type_is_matrix(type)) {
/* Matrices are packed/unpacked by considering each column vector in
* sequence.
*/
return lower_arraylike(state, rhs_swizzle, writemask, type, fine_location,
unpacked_var, unpacked_var_deref, name, false,
vertex_index);
} else if (glsl_get_vector_elements(type) * dmul + fine_location % 4 > 4) {
/* We don't have code to split a 64-bit variable across two
* varying slots; instead we add padding if necessary.
*/
unsigned aligned_fine_location = ALIGN_POT(fine_location, dmul);
if (aligned_fine_location != fine_location) {
return lower_varying(state, rhs_swizzle, writemask, type,
aligned_fine_location, unpacked_var,
unpacked_var_deref, name, false, vertex_index);
}
/* This vector is going to be "double parked" across two varying slots,
* so handle it as two separate assignments. For doubles, a dvec3/dvec4
* can end up being spread over 3 slots. However, the second split will
* happen later; here we always want to split into 2.
*/
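/* For example, a vec3 whose fine_location % 4 == 2 gets left_components == 2
* (written to .zw of the current slot) and right_components == 1 (written to
* .x of the next slot), matching the bar[0] case in the file header comment.
*/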
unsigned left_components, right_components;
unsigned left_swizzle_values[4] = { 0, 0, 0, 0 };
unsigned right_swizzle_values[4] = { 0, 0, 0, 0 };
char left_swizzle_name[4] = { 0, 0, 0, 0 };
char right_swizzle_name[4] = { 0, 0, 0, 0 };
left_components = 4 - fine_location % 4;
if (glsl_type_is_64bit(type)) {
left_components /= 2;
assert(left_components > 0);
}
right_components = glsl_get_vector_elements(type) - left_components;
/* If set, use the previously set writemask to offset the following
* swizzles/writemasks. This can happen when splitting a dvec, etc.,
* across slots.
*/
unsigned offset = 0;
if (writemask) {
for (unsigned i = 0; i < left_components; i++) {
/* Keep going until we find the first component of the write */
if (!(writemask & (1 << i))) {
offset++;
} else
break;
}
}
for (unsigned i = 0; i < left_components; i++) {
left_swizzle_values[i] = i + offset;
left_swizzle_name[i] = "xyzw"[i + offset];
}
for (unsigned i = 0; i < right_components; i++) {
right_swizzle_values[i] = i + left_components + offset;
right_swizzle_name[i] = "xyzw"[i + left_components + offset];
}
if (left_components) {
char *left_name = name ?
ralloc_asprintf(state->mem_ctx, "%s.%s", name, left_swizzle_name) :
NULL;
nir_def *left_swizzle = NULL;
unsigned left_writemask = ~0u;
if (state->mode == nir_var_shader_out) {
nir_def *ssa_def = rhs_swizzle ?
rhs_swizzle : nir_load_deref(&state->b, unpacked_var_deref);
left_swizzle =
nir_swizzle(&state->b, ssa_def,
left_swizzle_values, left_components);
} else {
left_writemask = ((1 << left_components) - 1) << offset;
}
const struct glsl_type *swiz_type =
glsl_vector_type(glsl_get_base_type(type), left_components);
fine_location = lower_varying(state, left_swizzle, left_writemask, swiz_type,
fine_location, unpacked_var, unpacked_var_deref,
left_name, false, vertex_index);
} else {
/* Top up the fine location to the next slot */
fine_location++;
}
char *right_name = name ?
ralloc_asprintf(state->mem_ctx, "%s.%s", name, right_swizzle_name) :
NULL;
nir_def *right_swizzle = NULL;
unsigned right_writemask = ~0u;
if (state->mode == nir_var_shader_out) {
nir_def *ssa_def = rhs_swizzle ?
rhs_swizzle : nir_load_deref(&state->b, unpacked_var_deref);
right_swizzle =
nir_swizzle(&state->b, ssa_def,
right_swizzle_values, right_components);
} else {
right_writemask = ((1 << right_components) - 1) << (left_components + offset);
}
const struct glsl_type *swiz_type =
glsl_vector_type(glsl_get_base_type(type), right_components);
return lower_varying(state, right_swizzle, right_writemask, swiz_type,
fine_location, unpacked_var, unpacked_var_deref,
right_name, false, vertex_index);
} else {
/* No special handling is necessary; (un)pack the old varying (now temp)
* from/into the new packed varying.
*/
unsigned components = glsl_get_vector_elements(type) * dmul;
unsigned location = fine_location / 4;
unsigned location_frac = fine_location % 4;
assert(state->components[location - VARYING_SLOT_VAR0] >= components);
nir_deref_instr *packed_deref =
get_packed_varying_deref(state, location, unpacked_var, name,
vertex_index);
nir_variable *packed_var =
state->packed_varyings[location - VARYING_SLOT_VAR0];
if (unpacked_var->data.stream != 0) {
assert(unpacked_var->data.stream < 4);
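/* Each component of the packed varying records its 2-bit stream ID, so
* e.g. a stream-1 varying landing in components .y and .z sets bits 2-3
* and 4-5 of data.stream.
*/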
for (unsigned i = 0; i < components; ++i) {
packed_var->data.stream |=
unpacked_var->data.stream << (2 * (location_frac + i));
}
}
struct packing_store_values *store_value;
if (state->mode == nir_var_shader_out) {
unsigned writemask = ((1 << components) - 1) << location_frac;
nir_def *value = rhs_swizzle ? rhs_swizzle :
nir_load_deref(&state->b, unpacked_var_deref);
store_value =
bitwise_assign_pack(state, packed_deref, unpacked_var_deref, type,
value, writemask);
} else {
unsigned swizzle_values[4] = { 0, 0, 0, 0 };
for (unsigned i = 0; i < components; ++i) {
swizzle_values[i] = i + location_frac;
}
nir_def *ssa_def = nir_load_deref(&state->b, packed_deref);
nir_def *swizzle =
nir_swizzle(&state->b, ssa_def, swizzle_values, components);
store_value = bitwise_assign_unpack(state, unpacked_var_deref,
packed_deref, type, swizzle,
writemask);
}
create_store_deref(state, store_value->deref, store_value->values[0],
store_value->writemasks[0], store_value->is_64bit);
if (store_value->is_64bit) {
create_store_deref(state, store_value->deref, store_value->values[1],
store_value->writemasks[1], store_value->is_64bit);
}
free(store_value);
return fine_location + components;
}
}
/* Recursively pack varying. */
static void
pack_output_var(struct lower_packed_varyings_state *state, nir_variable *var)
{
nir_deref_instr *unpacked_var_deref = nir_build_deref_var(&state->b, var);
lower_varying(state, NULL, ~0u, var->type,
var->data.location * 4 + var->data.location_frac,
var, unpacked_var_deref, var->name,
state->gs_input_vertices != 0, 0);
}
static void
lower_output_var(struct lower_packed_varyings_state *state, nir_variable *var)
{
if (var->data.mode != state->mode ||
var->data.location < VARYING_SLOT_VAR0 ||
!lower_packed_varying_needs_lowering(state->shader, var,
state->xfb_enabled,
state->disable_xfb_packing,
state->disable_varying_packing))
return;
/* Skip any new packed varyings we just added */
if (strncmp("packed:", var->name, 7) == 0)
return;
/* This lowering pass is only capable of packing floats and ints
* together when their interpolation mode is "flat". Treat integers as
* being flat when the interpolation mode is none.
*/
assert(var->data.interpolation == INTERP_MODE_FLAT ||
var->data.interpolation == INTERP_MODE_NONE ||
!glsl_contains_integer(var->type));
if (state->prog->SeparateShader && state->ifc_exposed_to_query_api) {
struct set *resource_set = _mesa_pointer_set_create(NULL);
nir_add_packed_var_to_resource_list(state->consts, state->prog,
resource_set, var,
state->shader->info.stage,
GL_PROGRAM_OUTPUT);
_mesa_set_destroy(resource_set, NULL);
}
/* Change the old varying into an ordinary global. */
var->data.mode = nir_var_shader_temp;
nir_foreach_block(block, state->impl) {
if (state->shader->info.stage != MESA_SHADER_GEOMETRY) {
/* For shaders other than geometry, outputs need to be lowered before
* each return statement and at the end of main()
*/
if (nir_block_ends_in_return_or_halt(block)) {
state->b.cursor = nir_before_instr(nir_block_last_instr(block));
pack_output_var(state, var);
} else if (block == nir_impl_last_block(state->impl)) {
state->b.cursor = nir_after_block(block);
pack_output_var(state, var);
}
} else {
/* For geometry shaders, outputs need to be lowered before each call
* to EmitVertex()
*/
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_emit_vertex)
continue;
state->b.cursor = nir_before_instr(instr);
pack_output_var(state, var);
}
}
}
}
static void
lower_packed_outputs(struct lower_packed_varyings_state *state)
{
nir_foreach_shader_out_variable_safe(var, state->shader) {
lower_output_var(state, var);
}
}
static void
lower_packed_inputs(struct lower_packed_varyings_state *state)
{
/* Shader inputs need to be lowered at the beginning of main(), so set the
* builder cursor to insert packing code at the start of the main function.
*/
state->b.cursor = nir_before_impl(state->impl);
/* Insert new varyings, lower old ones to locals, and add unpacking code at
* the start of the shader.
*/
nir_foreach_shader_in_variable_safe(var, state->shader) {
if (var->data.mode != state->mode ||
var->data.location < VARYING_SLOT_VAR0 ||
!lower_packed_varying_needs_lowering(state->shader, var,
state->xfb_enabled,
state->disable_xfb_packing,
state->disable_varying_packing))
continue;
/* Skip any new packed varyings we just added */
if (strncmp("packed:", var->name, 7) == 0)
continue;
/* This lowering pass is only capable of packing floats and ints
* together when their interpolation mode is "flat". Treat integers as
* being flat when the interpolation mode is none.
*/
assert(var->data.interpolation == INTERP_MODE_FLAT ||
var->data.interpolation == INTERP_MODE_NONE ||
!glsl_contains_integer(var->type));
/* The program interface needs to expose varyings in case of SSO. Add the
* variable to the program resource list before it gets modified and lost.
*/
if (state->prog->SeparateShader && state->ifc_exposed_to_query_api) {
struct set *resource_set = _mesa_pointer_set_create(NULL);
nir_add_packed_var_to_resource_list(state->consts, state->prog,
resource_set, var,
state->shader->info.stage,
GL_PROGRAM_INPUT);
_mesa_set_destroy(resource_set, NULL);
}
/* Change the old varying into an ordinary global. */
var->data.mode = nir_var_shader_temp;
/* Recursively unpack varying. */
nir_deref_instr *unpacked_var_deref = nir_build_deref_var(&state->b, var);
lower_varying(state, NULL, ~0u, var->type,
var->data.location * 4 + var->data.location_frac,
var, unpacked_var_deref, var->name,
state->gs_input_vertices != 0, 0);
}
}
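/**
* Entry point of the pass: packs the lowerable generic varyings of the given
* linked shader (inputs or outputs, depending on \c mode) into vec4/ivec4
* slots, then turns the now-unused varyings into locals and fixes up deref
* modes.
*/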
void
gl_nir_lower_packed_varyings(const struct gl_constants *consts,
struct gl_shader_program *prog,
void *mem_ctx, unsigned locations_used,
const uint8_t *components,
nir_variable_mode mode, unsigned gs_input_vertices,
struct gl_linked_shader *linked_shader,
bool disable_varying_packing,
bool disable_xfb_packing, bool xfb_enabled)
{
struct lower_packed_varyings_state state;
nir_shader *shader = linked_shader->Program->nir;
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
assert(shader->info.stage != MESA_SHADER_COMPUTE);
/* assert that functions have been inlined before packing is called */
nir_foreach_function(f, shader) {
assert(f->impl == impl);
}
state.b = nir_builder_create(impl);
state.consts = consts;
state.prog = prog;
state.mem_ctx = mem_ctx;
state.shader = shader;
state.impl = impl;
state.locations_used = locations_used;
state.components = components;
state.mode = mode;
state.gs_input_vertices = gs_input_vertices;
state.disable_varying_packing = disable_varying_packing;
state.disable_xfb_packing = disable_xfb_packing;
state.xfb_enabled = xfb_enabled;
state.packed_varyings =
(nir_variable **) rzalloc_array_size(mem_ctx, sizeof(nir_variable *),
locations_used);
/* Determine if the shader interface is exposed to the API resource query */
struct gl_linked_shader *linked_shaders[MESA_SHADER_STAGES];
unsigned num_shaders = 0;
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
if (prog->_LinkedShaders[i])
linked_shaders[num_shaders++] = prog->_LinkedShaders[i];
}
if (mode == nir_var_shader_in) {
state.ifc_exposed_to_query_api = linked_shaders[0] == linked_shader;
lower_packed_inputs(&state);
} else {
state.ifc_exposed_to_query_api =
linked_shaders[num_shaders - 1] == linked_shader;
lower_packed_outputs(&state);
}
nir_lower_global_vars_to_local(shader);
nir_fixup_deref_modes(shader);
}