src/amd/common/nir/ac_nir_cull.c - third_party/mesa - Git at Google

 /*
  * Copyright 2019 Advanced Micro Devices, Inc.
  * Copyright 2021 Valve Corporation
  *
  * SPDX-License-Identifier: MIT
  */

 #include "ac_nir.h"
 #include "ac_nir_helpers.h"
 #include "nir_builder.h"

 /* This code is adapted from ac_llvm_cull.c, hence the copyright to AMD. */

 /* Facts about the W components (pos[i][3]) of a primitive's vertices,
  * as accumulated by analyze_position_w().
  */
 typedef struct
 {
    /* XOR of the "W < 0" tests of all vertices, i.e. true for an odd number of negative Ws. */
    nir_def *w_reflection;
    /* True only if every vertex has a W that is <= 0 or NaN. */
    nir_def *all_w_negative_or_zero_or_nan;
    /* True if at least one vertex has W < 0. */
    nir_def *any_w_negative;
 } position_w_info;

 /* Inspect the W component (pos[i][3]) of the first num_vertices vertices and
  * store three booleans in *w_info:
  *
  *  - all_w_negative_or_zero_or_nan: every W is <= 0 or NaN,
  *  - w_reflection:                  an odd number of Ws is negative,
  *  - any_w_negative:                at least one W is negative.
  */
 static void
 analyze_position_w(nir_builder *b, nir_def *pos[][4], unsigned num_vertices,
                    position_w_info *w_info)
 {
    /* Neutral elements of the AND / XOR / OR reductions below. */
    nir_def *all_not_positive = nir_imm_true(b);
    nir_def *odd_negative_count = nir_imm_false(b);
    nir_def *some_negative = nir_imm_false(b);

    for (unsigned vtx = 0; vtx != num_vertices; vtx++) {
       nir_def *w = pos[vtx][3];

       nir_def *w_is_negative = nir_flt_imm(b, w, 0.0f);

       /* Unordered compare: also true when W is NaN. */
       nir_def *zero = nir_imm_float(b, 0.0f);
       nir_def *w_not_positive = nir_fgeu(b, zero, w);

       odd_negative_count = nir_ixor(b, w_is_negative, odd_negative_count);
       some_negative = nir_ior(b, w_is_negative, some_negative);
       all_not_positive = nir_iand(b, w_not_positive, all_not_positive);
    }

    w_info->all_w_negative_or_zero_or_nan = all_not_positive;
    w_info->w_reflection = odd_negative_count;
    w_info->any_w_negative = some_negative;
 }

 /* Face culling for a triangle.
  *
  * Builds a 2D determinant from the X/Y components of the three vertices,
  * negates it when w_info->w_reflection is set, and uses its sign together with
  * the cull_ccw / cull_front_face_enabled / cull_back_face_enabled values to
  * decide whether the triangle is culled. Triangles with a zero determinant are
  * culled as well.
  *
  * Returns a boolean that is true when the triangle is culled. A NaN or
  * infinite determinant never culls; that case is left to fixed-function HW.
  */
 static nir_def *
 cull_face_triangle(nir_builder *b, nir_def *pos[3][4], const position_w_info *w_info)
 {
    nir_def *x0 = pos[0][0], *y0 = pos[0][1];
    nir_def *x1 = pos[1][0], *y1 = pos[1][1];
    nir_def *x2 = pos[2][0], *y2 = pos[2][1];

    /* determinant = (x2 - x0) * (y1 - y0) - (x0 - x1) * (y0 - y2) */
    nir_def *dx_20 = nir_fsub(b, x2, x0);
    nir_def *dy_10 = nir_fsub(b, y1, y0);
    nir_def *dx_01 = nir_fsub(b, x0, x1);
    nir_def *dy_02 = nir_fsub(b, y0, y2);
    nir_def *product_a = nir_fmul(b, dx_20, dy_10);
    nir_def *product_b = nir_fmul(b, dx_01, dy_02);
    nir_def *determinant = nir_fsub(b, product_a, product_b);

    /* Flip the sign if the W analysis reported a reflection. */
    nir_def *negated = nir_fneg(b, determinant);
    determinant = nir_bcsel(b, w_info->w_reflection, negated, determinant);

    nir_def *is_positive = nir_fgt_imm(b, determinant, 0.0f);
    nir_def *is_degenerate = nir_feq_imm(b, determinant, 0.0f);
    nir_def *ccw = nir_load_cull_ccw_amd(b);
    nir_def *is_front = nir_ieq(b, is_positive, ccw);
    nir_def *front_cull_enabled = nir_load_cull_front_face_enabled_amd(b);
    nir_def *back_cull_enabled = nir_load_cull_back_face_enabled_amd(b);

    nir_def *culled = nir_bcsel(b, is_front, front_cull_enabled, back_cull_enabled);
    culled = nir_ior(b, culled, is_degenerate);

    /* NaN and +/-infinity are tricky, so never reject those here. */
    nir_def *is_finite = nir_fisfinite(b, determinant);
    return nir_iand(b, culled, is_finite);
 }

 /* Compute the X/Y bounding box of a triangle: for channels 0 and 1, the
  * minimum over the three vertices is written to bbox_min[] and the maximum
  * to bbox_max[].
  */
 static void
 calc_bbox_triangle(nir_builder *b, nir_def *pos[3][4], nir_def *bbox_min[2], nir_def *bbox_max[2])
 {
    for (unsigned axis = 0; axis != 2; axis++) {
       nir_def *v0 = pos[0][axis];
       nir_def *v1 = pos[1][axis];
       nir_def *v2 = pos[2][axis];

       nir_def *min_v1_v2 = nir_fmin(b, v1, v2);
       bbox_min[axis] = nir_fmin(b, v0, min_v1_v2);

       nir_def *max_v1_v2 = nir_fmax(b, v1, v2);
       bbox_max[axis] = nir_fmax(b, v0, max_v1_v2);
    }
 }

 /* Return a boolean that is true when the bounding box lies entirely outside
  * the [-1, 1] range on the X or the Y axis, i.e. when its maximum is below -1
  * or its minimum is above 1 on at least one of the two axes.
  */
 static nir_def *
 cull_frustrum(nir_builder *b, nir_def *bbox_min[2], nir_def *bbox_max[2])
 {
    nir_def *outside = nir_imm_false(b);

    for (unsigned axis = 0; axis != 2; axis++) {
       nir_def *below_range = nir_flt_imm(b, bbox_max[axis], -1.0f);
       outside = nir_ior(b, outside, below_range);

       nir_def *above_range = nir_fgt_imm(b, bbox_min[axis], 1.0f);
       outside = nir_ior(b, outside, above_range);
    }

    return outside;
 }

 /* 2D cross product of p and q: p.x * q.y - q.x * p.y (a signed scalar). */
 static nir_def *
 cross(nir_builder *b, nir_def *p[2], nir_def *q[2])
 {
    nir_def *px_qy = nir_fmul(b, p[0], q[1]);
    nir_def *qx_py = nir_fmul(b, q[0], p[1]);

    return nir_fsub(b, px_qy, qx_py);
 }

 /* Return whether point p is farther away from the 2D triangle "pos" than the
  * given distance.
  *
  * The test works on the signed distances between p and the three infinite
  * lines through the triangle edges; see the comment before the return
  * statement for why this is sufficient and where it is conservative.
  */
 static nir_def *
 point_outside_triangle(nir_builder *b, nir_def *p[2], nir_def *pos[3][2], nir_def *distance)
 {
    nir_def **va = pos[0];
    nir_def **vb = pos[1];
    nir_def **vc = pos[2];

    /* Edge vectors of the triangle. */
    nir_def *edge_ab[2], *edge_ac[2], *edge_bc[2];
    edge_ab[0] = nir_fsub(b, vb[0], va[0]);
    edge_ab[1] = nir_fsub(b, vb[1], va[1]);
    edge_ac[0] = nir_fsub(b, vc[0], va[0]);
    edge_ac[1] = nir_fsub(b, vc[1], va[1]);
    edge_bc[0] = nir_fsub(b, vc[0], vb[0]);
    edge_bc[1] = nir_fsub(b, vc[1], vb[1]);

    /* Vectors from vertices A and B to the point. */
    nir_def *a_to_p[2], *b_to_p[2];
    a_to_p[0] = nir_fsub(b, p[0], va[0]);
    a_to_p[1] = nir_fsub(b, p[1], va[1]);
    b_to_p[0] = nir_fsub(b, p[0], vb[0]);
    b_to_p[1] = nir_fsub(b, p[1], vb[1]);

    /* 2D cross products; dividing them by the edge length gives line distances. */
    nir_def *cross_ac = cross(b, a_to_p, edge_ac);
    nir_def *cross_ab = cross(b, edge_ab, a_to_p);
    nir_def *cross_bc = cross(b, edge_bc, b_to_p);

    /* Signed distances from the 3 infinite lines going through the triangle edges.
     *
     * Each one is a 2D cross product multiplied by the reciprocal square root of
     * the squared edge length. The sign tells on which side of the line the
     * point lies.
     */
    nir_def *numerators[3] = { cross_ac, cross_ab, cross_bc };
    nir_def **edges[3] = { edge_ac, edge_ab, edge_bc };
    nir_def *line_dist[3];

    for (unsigned i = 0; i != 3; i++) {
       nir_def *length_sq = nir_fdot2(b, nir_vec(b, edges[i], 2), nir_vec(b, edges[i], 2));
       nir_def *inv_length = nir_frsq(b, length_sq);
       line_dist[i] = nir_fmul(b, numerators[i], inv_length);
    }

    nir_def *largest = nir_fmax(b, line_dist[0], nir_fmax(b, line_dist[1], line_dist[2]));
    nir_def *smallest = nir_fmin(b, line_dist[0], nir_fmin(b, line_dist[1], line_dist[2]));

    /* The point is reported as outside if largest > distance && smallest < -distance.
     *
     * Why this works:
     *
     * For a point outside the triangle, the three signed distances have mixed
     * signs: 2 positive and 1 negative, or 2 negative and 1 positive, depending
     * on the winding and on where the point is. Both comparisons can therefore
     * pass.
     *
     * For a point inside the triangle, all distances have the same sign. Either
     * they are all positive and "smallest < -distance" fails, or they are all
     * negative and "largest > distance" fails.
     *
     * Note that these are distances from the lines, not from the triangle
     * itself. The result can wrongly say that the point is closer to the
     * triangle than the given distance when 2 of the infinite lines leave the
     * same vertex in the direction of the point with a very small angle between
     * them. Most of those cases should already be eliminated by the
     * rounding-based small primitive culling.
     */
    nir_def *beyond_positive = nir_flt(b, distance, largest);
    nir_def *neg_distance = nir_fneg(b, distance);
    nir_def *beyond_negative = nir_flt(b, smallest, neg_distance);

    return nir_iand(b, beyond_positive, beyond_negative);
 }

 /* Small primitive culling for triangles.
  *
  * The bounding box is transformed to screen space with the triangle viewport
  * scale/offset, enlarged by the small-triangle precision, and both corners are
  * rounded. If min and max round to the same value on either axis, the triangle
  * is rejected.
  *
  * If use_point_tri_intersection is set, triangles that survive this test and
  * whose rounded bounding box is at most one pixel wide and high are
  * additionally tested with point_outside_triangle() against a single pixel
  * center.
  *
  * Returns a boolean that is true when the triangle is rejected.
  */
 static nir_def *
 cull_small_primitive_triangle(nir_builder *b, bool use_point_tri_intersection,
                               nir_def *bbox_min[2], nir_def *bbox_max[2], nir_def *pos[3][4])
 {
    nir_def *viewport = nir_load_cull_triangle_viewport_xy_scale_and_offset_amd(b);
    nir_def *precision = nir_load_cull_small_triangle_precision_amd(b);
    nir_def *culled = nir_imm_false(b);

    nir_def *px_min[2], *px_max[2], *scale[2], *offset[2];

    for (unsigned axis = 0; axis != 2; axis++) {
       /* Viewport channels 0-1 hold the XY scale, channels 2-3 the XY offset. */
       scale[axis] = nir_channel(b, viewport, axis);
       offset[axis] = nir_channel(b, viewport, 2 + axis);

       /* Bounding box edges in screen space. */
       nir_def *lo = nir_ffma(b, bbox_min[axis], scale[axis], offset[axis]);
       nir_def *hi = nir_ffma(b, bbox_max[axis], scale[axis], offset[axis]);

       /* Grow the box by the precision on both sides. */
       lo = nir_fsub(b, lo, precision);
       hi = nir_fadd(b, hi, precision);

       /* If both edges round to the same integer, the box doesn't intersect a sample point. */
       px_min[axis] = nir_fround_even(b, lo);
       px_max[axis] = nir_fround_even(b, hi);

       nir_def *same_pixel = nir_feq(b, px_min[axis], px_max[axis]);
       culled = nir_ior(b, culled, same_pixel);
    }

    if (!use_point_tri_intersection)
       return culled;

    /* The triangle hasn't been filtered out by the rounding test: try the
     * point-triangle intersection test as a second chance.
     */
    nir_def *center_outside = NULL;
    nir_if *if_not_culled = nir_push_if(b, nir_inot(b, culled));
    {
       /* Dimensions of the rounded bounding box. */
       nir_def *px_width = nir_fsub(b, px_max[0], px_min[0]);
       nir_def *px_height = nir_fsub(b, px_max[1], px_min[1]);

       /* The largest bounding box (rounded to integer coordinates) that contains the triangle
        * that we accept has 1x1 pixel area and looks like this:
        *
        *    X         X         X
        *
        *         ┌─────────┐
        *         │         │
        *    X    │    X    │    X
        *         │         │
        *         └─────────┘
        *
        *    X         X         X
        *
        * Before the rounding, however, the largest bounding box that contains the triangle
        * can be this:
        *
        *    X         X         X
        *     ┌─────────────────┐
        *     │                 │
        *     │                 │
        *    X│        X        │X
        *     │                 │
        *     │                 │
        *     └─────────────────┘
        *    X         X         X
        *
        * That is the largest area with 1 pixel center in the middle and 8 pixel centers
        * outside. A rounded bounding box of 1x1 pixels therefore stands for an area slightly
        * smaller than 2x2 pixels with a single pixel center inside. Since the triangle is
        * always inside its bounding box, one point-triangle intersection test is enough.
        *
        * Check whether the rounded bounding box is a single pixel, meaning the triangle can
        * only potentially affect that pixel.
        *
        * 1.01 is used instead of 1 to prevent possible FP precision issues.
        */
       nir_def *one_px_wide = nir_flt_imm(b, px_width, 1.01);
       nir_def *one_px_high = nir_flt_imm(b, px_height, 1.01);
       nir_def *not_tested = nir_imm_false(b);
       nir_if *if_single_px = nir_push_if(b, nir_iand(b, one_px_wide, one_px_high));
       {
          /* Screen-space coordinates of the pixel center. */
          nir_def *center[] = {
             nir_fadd_imm(b, px_min[0], 0.5),
             nir_fadd_imm(b, px_min[1], 0.5),
          };

          /* X, Y coordinates of the 3 triangle vertices in screen space. */
          nir_def *screen_xy[3][2] = {{0}};

          for (unsigned vtx = 0; vtx != 3; vtx++) {
             for (unsigned axis = 0; axis != 2; axis++) {
                screen_xy[vtx][axis] =
                   nir_ffma(b, pos[vtx][axis], scale[axis], offset[axis]);
             }
          }

          /* The precision is the rasterization precision along the X and Y axes: the size
           * of one cell of the fixed-point grid that vertex positions are snapped to.
           * Snapping can move a vertex by +-precision along each axis.
           *
           * A value that holds in every direction is needed, so use the worst case: the
           * hypotenuse of a right triangle whose two catheti both have length "precision".
           *
           * x = precision
           * sqrt(x*x + x*x) = sqrt(x*x*2) = x * sqrt(2)
           */
          nir_def *omni_precision = nir_fmul_imm(b, precision, sqrt(2));

          /* A triangle whose distance from the pixel center exceeds that precision can be
           * safely removed.
           */
          center_outside = point_outside_triangle(b, center, screen_xy, omni_precision);
       }
       nir_pop_if(b, if_single_px);

       /* Bounding boxes larger than one pixel are not rejected by this test. */
       center_outside = nir_if_phi(b, center_outside, not_tested);
    }
    nir_pop_if(b, if_not_culled);

    /* On the else path the triangle was already rejected by the rounding test. */
    return nir_if_phi(b, center_outside, culled);
 }

 /* Emit a call to accept_func(b, state) inside an "if (accepted)" block whose
  * selection control is set to nir_selection_control_divergent_always_taken.
  * Does nothing when accept_func is NULL.
  */
 static void
 call_accept_func(nir_builder *b, nir_def *accepted, ac_nir_cull_accepted accept_func,
                  void *state)
 {
    if (accept_func) {
       nir_if *accepted_branch = nir_push_if(b, accepted);
       accepted_branch->control = nir_selection_control_divergent_always_taken;

       accept_func(b, state);

       nir_pop_if(b, accepted_branch);
    }
 }

 /* Cull a triangle.
  *
  * A triangle stays accepted only if initially_accepted is true, not all of its
  * Ws are negative/zero/NaN, and face culling doesn't remove it. Accepted
  * triangles then go through the bounding-box tests: the [-1, 1] range test and,
  * unless skip_viewport_state_culling is set, small primitive culling guarded by
  * the cull_small_triangles_enabled value. Triangles with any negative W are
  * never rejected by the bounding-box tests.
  *
  * accept_func (if not NULL) is emitted for triangles that pass everything.
  * Returns a boolean that is true when the triangle is accepted.
  */
 static nir_def *
 ac_nir_cull_triangle(nir_builder *b,
                      bool skip_viewport_state_culling,
                      bool use_point_tri_intersection,
                      nir_def *initially_accepted,
                      nir_def *pos[3][4],
                      position_w_info *w_info,
                      ac_nir_cull_accepted accept_func,
                      void *state)
 {
    nir_def *w_ok = nir_inot(b, w_info->all_w_negative_or_zero_or_nan);
    nir_def *accepted = nir_iand(b, initially_accepted, w_ok);

    nir_def *face_ok = nir_inot(b, cull_face_triangle(b, pos, w_info));
    accepted = nir_iand(b, accepted, face_ok);

    nir_def *passed_bbox = NULL;

    nir_if *if_still_accepted = nir_push_if(b, accepted);
    {
       nir_def *box_lo[2] = {0}, *box_hi[2] = {0};
       calc_bbox_triangle(b, pos, box_lo, box_hi);

       nir_def *off_screen = cull_frustrum(b, box_lo, box_hi);
       nir_def *rejected = off_screen;

       if (!skip_viewport_state_culling) {
          nir_def *small_cull_enabled = nir_load_cull_small_triangles_enabled_amd(b);
          nir_if *if_small_cull = nir_push_if(b, small_cull_enabled);
          {
             nir_def *too_small = cull_small_primitive_triangle(b, use_point_tri_intersection,
                                                                box_lo, box_hi, pos);
             rejected = nir_ior(b, rejected, too_small);
          }
          nir_pop_if(b, if_small_cull);

          /* Without small primitive culling only the range test counts. */
          rejected = nir_if_phi(b, rejected, off_screen);
       }

       /* Keep the triangle if it wasn't rejected or if any W is negative. */
       nir_def *not_rejected = nir_inot(b, rejected);
       passed_bbox = nir_ior(b, not_rejected, w_info->any_w_negative);
       call_accept_func(b, passed_bbox, accept_func, state);
    }
    nir_pop_if(b, if_still_accepted);

    return nir_if_phi(b, passed_bbox, accepted);
 }

 /* Rotate the 2D vector v by 45 degrees and scale it by sqrt(2), in place.
  *
  * A plain rotation by 45 degrees is:
  *
  *    x2  =  x*cos(45) - y*sin(45)
  *    y2  =  x*sin(45) + y*cos(45)
  *
  * and because sin(45) == cos(45):
  *
  *    x2  =  (x - y) * cos(45)
  *    y2  =  (x + y) * cos(45)
  *
  * Each square (rotated diamond) is sqrt(0.5) wide. We want round() to return
  * the position of the closest square center, so the width has to be scaled to
  * 1, which means multiplying by 1/sqrt(0.5) = sqrt(2):
  *
  *    x2  =  (x - y) * cos(45) * sqrt(2)
  *    y2  =  (x + y) * cos(45) * sqrt(2)
  *
  * cos(45) * sqrt(2) is 1, which leaves:
  *
  *    x2  =  x - y
  *    y2  =  x + y
  */
 static void
 rotate_45degrees(nir_builder *b, nir_def *v[2])
 {
    nir_def *x = v[0];
    nir_def *y = v[1];

    v[0] = nir_fsub(b, x, y);
    v[1] = nir_fadd(b, x, y);
 }

 /* Compute the X/Y bounding box of a line (vertices 0 and 1 of pos) and grow it
  * on both sides by the matching channel of the clip_half_line_width value.
  */
 static void
 calc_bbox_line(nir_builder *b, nir_def *pos[3][4], nir_def *bbox_min[2], nir_def *bbox_max[2])
 {
    nir_def *half_widths = nir_load_clip_half_line_width_amd(b);

    for (unsigned axis = 0; axis != 2; axis++) {
       nir_def *lo = nir_fmin(b, pos[0][axis], pos[1][axis]);
       nir_def *hi = nir_fmax(b, pos[0][axis], pos[1][axis]);

       nir_def *half_width = nir_channel(b, half_widths, axis);

       bbox_min[axis] = nir_fsub(b, lo, half_width);
       bbox_max[axis] = nir_fadd(b, hi, half_width);
    }
 }

 /* Small primitive filter for lines: eliminate lines that are too small to
  * affect a sample.
  *
  * When the cull_small_lines_enabled value is true, the result is
  * "line is small" OR prim_is_small_else; otherwise it is prim_is_small_else
  * unchanged. bbox_min and bbox_max are not read by this function.
  */
 static nir_def *
 cull_small_primitive_line(nir_builder *b, nir_def *pos[3][4],
                           nir_def *bbox_min[2], nir_def *bbox_max[2],
                           nir_def *prim_is_small_else)
 {
    nir_def *culled = NULL;

    nir_def *filter_enabled = nir_load_cull_small_lines_enabled_amd(b);
    nir_if *if_filter_enabled = nir_push_if(b, filter_enabled);
    {
       /* This is only valid for lines without perpendicular end caps. Lines with
        * perpendicular end caps are rasterized as quads, and since line_width >= 1
        * they can't be culled as small prims in 99% of cases.
        *
        * The test relies on the diamond exit rule: every pixel contains a diamond
        * touching the pixel boundary, and a pixel is filled only if the line exits
        * its diamond. A line that enters the diamond or stays outside of it doesn't
        * fill the pixel.
        *
        * The algorithm here is a little simpler. The space outside all diamonds has
        * the same diamond shape; call those corner diamonds.
        *
        * Every line that lies entirely inside one diamond, corner diamonds included,
        * is culled. Inside a regular diamond, the line doesn't exit it. Inside a
        * corner diamond, the line doesn't enter any diamond and so can't exit one.
        *
        * To test this, the viewport is rotated by 45 degrees, which turns diamonds
        * into squares, and a bounding box test tells whether the line is entirely
        * inside any square (diamond).
        *
        * The line width doesn't matter: wide lines only duplicate the filled pixels
        * in either the X or the Y direction. MSAA doesn't matter either; MSAA should
        * ideally use perpendicular end caps, which enable quad rasterization for
        * lines. Thus, this should always use the non-MSAA viewport transformation
        * and the non-MSAA small prim precision.
        *
        * A good test is piglit/lineloop, which draws 10k subpixel lines in a circle.
        * There should be no holes if this matches hw behavior.
        */
       nir_def *viewport = nir_load_cull_line_viewport_xy_scale_and_offset_amd(b);
       nir_def *start[2], *end[2];

       /* Both vertex positions in pixels. */
       for (unsigned axis = 0; axis != 2; axis++) {
          nir_def *scale = nir_channel(b, viewport, axis);
          nir_def *offset = nir_channel(b, viewport, 2 + axis);

          start[axis] = nir_ffma(b, pos[0][axis], scale, offset);
          end[axis] = nir_ffma(b, pos[1][axis], scale, offset);
       }

       /* Turn diamonds into squares. */
       rotate_45degrees(b, start);
       rotate_45degrees(b, end);

       nir_def *precision = nir_load_cull_small_line_precision_amd(b);

       nir_def *same_square[2];
       for (unsigned axis = 0; axis != 2; axis++) {
          /* Bounding box around both vertices, needed because the line area has
           * to be enlarged by the precision of the rasterizer.
           */
          nir_def *lo = nir_fmin(b, start[axis], end[axis]);
          nir_def *hi = nir_fmax(b, start[axis], end[axis]);

          lo = nir_fsub(b, lo, precision);
          hi = nir_fadd(b, hi, precision);

          /* Equal rounded corners mean the box is entirely inside one square (diamond). */
          lo = nir_fround_even(b, lo);
          hi = nir_fround_even(b, hi);

          same_square[axis] = nir_feq(b, lo, hi);
       }

       culled = nir_iand(b, same_square[0], same_square[1]);
       culled = nir_ior(b, culled, prim_is_small_else);
    }
    nir_pop_if(b, if_filter_enabled);

    return nir_if_phi(b, culled, prim_is_small_else);
 }

 /* Cull a line.
  *
  * A line stays accepted only if initially_accepted is true and not all of its
  * Ws are negative/zero/NaN. With skip_viewport_state_culling set, that is the
  * whole test. Otherwise accepted lines also go through the [-1, 1] range test
  * on their bounding box and the small line filter; lines with any negative W
  * are never rejected by those two tests.
  *
  * accept_func (if not NULL) is emitted for lines that pass everything.
  * Returns a boolean that is true when the line is accepted.
  */
 static nir_def *
 ac_nir_cull_line(nir_builder *b,
                  bool skip_viewport_state_culling,
                  nir_def *initially_accepted,
                  nir_def *pos[3][4],
                  position_w_info *w_info,
                  ac_nir_cull_accepted accept_func,
                  void *state)
 {
    nir_def *w_ok = nir_inot(b, w_info->all_w_negative_or_zero_or_nan);
    nir_def *accepted = nir_iand(b, initially_accepted, w_ok);

    if (skip_viewport_state_culling) {
       call_accept_func(b, accepted, accept_func, state);
       return accepted;
    }

    nir_def *passed_bbox = NULL;

    nir_if *if_still_accepted = nir_push_if(b, accepted);
    {
       nir_def *box_lo[2] = {0}, *box_hi[2] = {0};
       calc_bbox_line(b, pos, box_lo, box_hi);

       /* Eliminate lines that are fully outside the view or too small. */
       nir_def *off_screen = cull_frustrum(b, box_lo, box_hi);
       nir_def *invisible = cull_small_primitive_line(b, pos, box_lo, box_hi, off_screen);

       /* Keep the line if it wasn't rejected or if any W is negative. */
       nir_def *not_rejected = nir_inot(b, invisible);
       passed_bbox = nir_ior(b, not_rejected, w_info->any_w_negative);
       call_accept_func(b, passed_bbox, accept_func, state);
    }
    nir_pop_if(b, if_still_accepted);

    return nir_if_phi(b, passed_bbox, accepted);
 }

 /* Emit culling code for one primitive.
  *
  * Analyzes the W components of the first num_vertices vertices and dispatches
  * to triangle culling (3 vertices) or line culling (2 vertices). Any other
  * vertex count is treated as unreachable, since point culling isn't implemented.
  *
  * Returns the boolean produced by the triangle or line culling function.
  */
 nir_def *
 ac_nir_cull_primitive(nir_builder *b,
                       bool skip_viewport_state_culling,
                       bool use_point_tri_intersection,
                       nir_def *initially_accepted,
                       nir_def *pos[3][4],
                       unsigned num_vertices,
                       ac_nir_cull_accepted accept_func,
                       void *state)
 {
    position_w_info w_info = {0};
    analyze_position_w(b, pos, num_vertices, &w_info);

    switch (num_vertices) {
    case 3:
       return ac_nir_cull_triangle(b, skip_viewport_state_culling, use_point_tri_intersection,
                                   initially_accepted, pos, &w_info, accept_func, state);
    case 2:
       return ac_nir_cull_line(b, skip_viewport_state_culling, initially_accepted, pos, &w_info,
                               accept_func, state);
    default:
       unreachable("point culling not implemented");
    }

    return NULL;
 }
	/*
	* Copyright 2019 Advanced Micro Devices, Inc.
	* Copyright 2021 Valve Corporation
	*
	* SPDX-License-Identifier: MIT
	*/

	#include "ac_nir.h"
	#include "ac_nir_helpers.h"
	#include "nir_builder.h"

	/* This code is adapted from ac_llvm_cull.c, hence the copyright to AMD. */

	/* Facts about the W components (pos[i][3]) of a primitive's vertices,
	 * as accumulated by analyze_position_w().
	 */
	typedef struct
	{
	/* XOR of the "W < 0" tests of all vertices, i.e. true for an odd number of negative Ws. */
	nir_def *w_reflection;
	/* True only if every vertex has a W that is <= 0 or NaN. */
	nir_def *all_w_negative_or_zero_or_nan;
	/* True if at least one vertex has W < 0. */
	nir_def *any_w_negative;
	} position_w_info;

	/* Inspect the W component (pos[i][3]) of the first num_vertices vertices and
	 * store three booleans in *w_info:
	 *
	 *  - all_w_negative_or_zero_or_nan: every W is <= 0 or NaN,
	 *  - w_reflection:                  an odd number of Ws is negative,
	 *  - any_w_negative:                at least one W is negative.
	 *
	 * b is the NIR builder the instructions are emitted with; pos is an array of
	 * per-vertex XYZW component pointers.
	 */
	static void
	analyze_position_w(nir_builder *b, nir_def *pos[][4], unsigned num_vertices,
	                   position_w_info *w_info)
	{
	   w_info->all_w_negative_or_zero_or_nan = nir_imm_true(b);
	   w_info->w_reflection = nir_imm_false(b);
	   w_info->any_w_negative = nir_imm_false(b);

	   for (unsigned i = 0; i < num_vertices; ++i) {
	      nir_def *neg_w = nir_flt_imm(b, pos[i][3], 0.0f);
	      /* Unordered compare: also true when W is NaN. */
	      nir_def *neg_or_zero_or_nan_w = nir_fgeu(b, nir_imm_float(b, 0.0f), pos[i][3]);

	      w_info->w_reflection = nir_ixor(b, neg_w, w_info->w_reflection);
	      w_info->any_w_negative = nir_ior(b, neg_w, w_info->any_w_negative);
	      w_info->all_w_negative_or_zero_or_nan =
	         nir_iand(b, neg_or_zero_or_nan_w, w_info->all_w_negative_or_zero_or_nan);
	   }
	}

	/* Face culling for a triangle.
	 *
	 * Builds a 2D determinant from the X/Y components of the three vertices,
	 * negates it when w_info->w_reflection is set, and uses its sign together with
	 * the cull_ccw / cull_front_face_enabled / cull_back_face_enabled values to
	 * decide whether the triangle is culled. Triangles with a zero determinant are
	 * culled as well.
	 *
	 * Returns a boolean that is true when the triangle is culled.
	 */
	static nir_def *
	cull_face_triangle(nir_builder *b, nir_def *pos[3][4], const position_w_info *w_info)
	{
	   nir_def *det_t0 = nir_fsub(b, pos[2][0], pos[0][0]);
	   nir_def *det_t1 = nir_fsub(b, pos[1][1], pos[0][1]);
	   nir_def *det_t2 = nir_fsub(b, pos[0][0], pos[1][0]);
	   nir_def *det_t3 = nir_fsub(b, pos[0][1], pos[2][1]);
	   nir_def *det_p0 = nir_fmul(b, det_t0, det_t1);
	   nir_def *det_p1 = nir_fmul(b, det_t2, det_t3);
	   nir_def *det = nir_fsub(b, det_p0, det_p1);

	   /* Flip the sign if the W analysis reported a reflection. */
	   det = nir_bcsel(b, w_info->w_reflection, nir_fneg(b, det), det);

	   nir_def *front_facing_ccw = nir_fgt_imm(b, det, 0.0f);
	   nir_def *zero_area = nir_feq_imm(b, det, 0.0f);
	   nir_def *ccw = nir_load_cull_ccw_amd(b);
	   nir_def *front_facing = nir_ieq(b, front_facing_ccw, ccw);
	   nir_def *cull_front = nir_load_cull_front_face_enabled_amd(b);
	   nir_def *cull_back = nir_load_cull_back_face_enabled_amd(b);

	   nir_def *face_culled = nir_bcsel(b, front_facing, cull_front, cull_back);
	   face_culled = nir_ior(b, face_culled, zero_area);

	   /* Don't reject NaN and +/-infinity, these are tricky.
	    * Just trust fixed-function HW to handle these cases correctly.
	    */
	   return nir_iand(b, face_culled, nir_fisfinite(b, det));
	}

	/* Compute the X/Y bounding box of a triangle: for channels 0 and 1, the
	 * minimum over the three vertices is written to bbox_min[] and the maximum
	 * to bbox_max[].
	 */
	static void
	calc_bbox_triangle(nir_builder *b, nir_def *pos[3][4], nir_def *bbox_min[2], nir_def *bbox_max[2])
	{
	   for (unsigned chan = 0; chan < 2; ++chan) {
	      bbox_min[chan] = nir_fmin(b, pos[0][chan], nir_fmin(b, pos[1][chan], pos[2][chan]));
	      bbox_max[chan] = nir_fmax(b, pos[0][chan], nir_fmax(b, pos[1][chan], pos[2][chan]));
	   }
	}

	/* Return a boolean that is true when the bounding box lies entirely outside
	 * the [-1, 1] range on the X or the Y axis, i.e. when its maximum is below -1
	 * or its minimum is above 1 on at least one of the two axes.
	 */
	static nir_def *
	cull_frustrum(nir_builder *b, nir_def *bbox_min[2], nir_def *bbox_max[2])
	{
	   nir_def *prim_outside_view = nir_imm_false(b);

	   for (unsigned chan = 0; chan < 2; ++chan) {
	      prim_outside_view = nir_ior(b, prim_outside_view, nir_flt_imm(b, bbox_max[chan], -1.0f));
	      prim_outside_view = nir_ior(b, prim_outside_view, nir_fgt_imm(b, bbox_min[chan], 1.0f));
	   }

	   return prim_outside_view;
	}

	/* 2D cross product of p and q: p.x * q.y - q.x * p.y (a signed scalar). */
	static nir_def *
	cross(nir_builder *b, nir_def *p[2], nir_def *q[2])
	{
	   nir_def *left = nir_fmul(b, p[0], q[1]);
	   nir_def *right = nir_fmul(b, q[0], p[1]);
	   return nir_fsub(b, left, right);
	}

	/* Return whether the distance between the point and the triangle is greater than the given
	 * distance.
	 *
	 * p is the 2D point, pos holds the X/Y components of the three triangle vertices, and
	 * distance is the threshold. The result is a boolean.
	 */
	static nir_def *
	point_outside_triangle(nir_builder *b, nir_def *p[2], nir_def *pos[3][2], nir_def *distance)
	{
	   nir_def **vtx_a = pos[0], **vtx_b = pos[1], **vtx_c = pos[2];
	   nir_def *a_b[2] = { nir_fsub(b, vtx_b[0], vtx_a[0]), nir_fsub(b, vtx_b[1], vtx_a[1]) };
	   nir_def *a_c[2] = { nir_fsub(b, vtx_c[0], vtx_a[0]), nir_fsub(b, vtx_c[1], vtx_a[1]) };
	   nir_def *b_c[2] = { nir_fsub(b, vtx_c[0], vtx_b[0]), nir_fsub(b, vtx_c[1], vtx_b[1]) };
	   nir_def *a_p[2] = { nir_fsub(b, p[0], vtx_a[0]), nir_fsub(b, p[1], vtx_a[1]) };
	   nir_def *b_p[2] = { nir_fsub(b, p[0], vtx_b[0]), nir_fsub(b, p[1], vtx_b[1]) };

	   /* Compute 2D cross products, which we need for computing distances from lines. */
	   nir_def *crosses[3] = { cross(b, a_p, a_c), cross(b, a_b, a_p), cross(b, b_c, b_p) };

	   /* These are distances from the 3 infinite lines going through triangle edges.
	    *
	    * A distance is positive if the point is on one side of the half space, and negative
	    * if the point is on the other side of the half space. That's because the distance is
	    * a normalized 2D cross product, which is always scalar and signed.
	    */
	   nir_def *line_distances[3] = {
	      nir_fmul(b, crosses[0], nir_frsq(b, nir_fdot2(b, nir_vec(b, a_c, 2), nir_vec(b, a_c, 2)))),
	      nir_fmul(b, crosses[1], nir_frsq(b, nir_fdot2(b, nir_vec(b, a_b, 2), nir_vec(b, a_b, 2)))),
	      nir_fmul(b, crosses[2], nir_frsq(b, nir_fdot2(b, nir_vec(b, b_c, 2), nir_vec(b, b_c, 2)))),
	   };

	   nir_def *max_distance =
	      nir_fmax(b, line_distances[0], nir_fmax(b, line_distances[1], line_distances[2]));
	   nir_def *min_distance =
	      nir_fmin(b, line_distances[0], nir_fmin(b, line_distances[1], line_distances[2]));

	   /* If max_distance > distance && min_distance < -distance, the point is outside the triangle.
	    *
	    * Explanation:
	    *
	    * If the point is outside the triangle, 2 distances are positive and 1 is negative, or
	    * 2 distances are negative and 1 is positive (depending on winding and where the point
	    * is). max_distance > distance will pass because at least 1 distance is positive, and
	    * min_distance < -distance will pass because at least 1 distance is negative.
	    *
	    * However, if the point is inside the triangle, either all distances are positive
	    * (min_distance < -distance will fail) or all distances are negative
	    * (max_distance > distance will fail), depending on winding.
	    *
	    * Note that min/max_distance are not distances from the triangle, but they are distances
	    * from the lines. This can falsely return that the distance between the point and the
	    * triangle is less than the given distance if 2 infinite lines are sticking out of
	    * 1 vertex, are pointing in the direction of the point, and there is a very small angle
	    * between them. Most of these cases should be eliminated by the rounding-based small
	    * prim culling.
	    */
	   return nir_iand(b, nir_flt(b, distance, max_distance),
	                   nir_flt(b, min_distance, nir_fneg(b, distance)));
	}

	static nir_def *
	cull_small_primitive_triangle(nir_builder *b, bool use_point_tri_intersection,
	nir_def bbox_min[2], nir_def bbox_max[2], nir_def *pos[3][4])
	{
	nir_def *vp = nir_load_cull_triangle_viewport_xy_scale_and_offset_amd(b);
	nir_def *small_prim_precision = nir_load_cull_small_triangle_precision_amd(b);
	nir_def *rejected = nir_imm_false(b);

	nir_def bbox_pixel_min[2], bbox_pixel_max[2], vp_scale[2], vp_translate[2];

	for (unsigned chan = 0; chan < 2; ++chan) {
	vp_scale[chan] = nir_channel(b, vp, chan);
	vp_translate[chan] = nir_channel(b, vp, 2 + chan);

	/* Convert the position to screen-space coordinates. */
	nir_def *min = nir_ffma(b, bbox_min[chan], vp_scale[chan], vp_translate[chan]);
	nir_def *max = nir_ffma(b, bbox_max[chan], vp_scale[chan], vp_translate[chan]);

	/* Scale the bounding box according to precision. */
	min = nir_fsub(b, min, small_prim_precision);
	max = nir_fadd(b, max, small_prim_precision);

	/* Determine if the bbox intersects the sample point, by checking if the min and max round to the same int. */
	bbox_pixel_min[chan] = nir_fround_even(b, min);
	bbox_pixel_max[chan] = nir_fround_even(b, max);

	nir_def *rounded_to_eq = nir_feq(b, bbox_pixel_min[chan], bbox_pixel_max[chan]);
	rejected = nir_ior(b, rejected, rounded_to_eq);
	}

	/* If the triangle hasn't been filtered out yet, try another way.
	* Only execute this code if this subgroup has culled at least 1 small triangle, which indicates
	* that there are probably more small triangles that could be culled.
	*/
	if (use_point_tri_intersection) {
	nir_def *outside_center = NULL;
	nir_if *if_passed = nir_push_if(b, nir_inot(b, rejected));
	{
	/* Calculate rounded bounding box dimensions. */
	nir_def *bbox_pixel_w = nir_fsub(b, bbox_pixel_max[0], bbox_pixel_min[0]);
	nir_def *bbox_pixel_h = nir_fsub(b, bbox_pixel_max[1], bbox_pixel_min[1]);

	/* The largest bounding box (rounded to integer coordinates) that contains the triangle
	* that we accept has 1x1 pixel area and looks like this:
	*
	* X X X
	*
	* ┌─────────┐
	* │ │
	* X │ X │ X
	* │ │
	* └─────────┘
	*
	* X X X
	*
	* However, the largest bounding box before the rounding that contains the triangle can be
	* this:
	*
	* X X X
	* ┌─────────────────┐
	* │ │
	* │ │
	* X│ X │X
	* │ │
	* │ │
	* └─────────────────┘
	* X X X
	*
	* which is the largest area that has 1 pixel center in the middle and 8 pixel centers
	* outside. Therefore, a 1x1 pixels-large rounded bounding box represents an area that's
	* slightly smaller than 2x2 pixels and has only a single pixel in the center. Thanks to
	* that and given that the triangle is always inside the bounding box, we only have to
	* compute a single point-triangle intersection.
	*
	* Check if the triangle's rounded bounding box is a single pixel, which means the triangle
	* can only potentially affect this pixel.
	*
	* 1.01 is used to prevent possible FP precision issues.
	*/
	nir_def *w_1px = nir_flt_imm(b, bbox_pixel_w, 1.01);
	nir_def *h_1px = nir_flt_imm(b, bbox_pixel_h, 1.01);
	nir_def *fals = nir_imm_false(b);
	nir_if *if_tri_1px = nir_push_if(b, nir_iand(b, w_1px, h_1px));
	{
	/* The coordinates of the pixel center in screen space. */
	nir_def *pix_center[] = {
	nir_fadd_imm(b, bbox_pixel_min[0], 0.5),
	nir_fadd_imm(b, bbox_pixel_min[1], 0.5),
	};

	/* These are the X, Y coordinates of the 3 points of the triangle. */
	nir_def *screen_pos[3][2] = {{0}};

	/* Transform the coordinates to screen space. */
	for (unsigned vtx = 0; vtx < 3; ++vtx) {
	for (unsigned chan = 0; chan < 2; ++chan)
	screen_pos[vtx][chan] = nir_ffma(b, pos[vtx][chan], vp_scale[chan], vp_translate[chan]);
	}

	/* small_prim_precision is the rasterization precision in X and Y axes, meaning it's the size of
	* one cell in the fixed-point grid that vertex positions are snapped to. When floating-point
	* coordinates are snapped (rounded) to fixed-point, vertex positions can be shifted by
	* +-small_prim_precision.
	*
	* We need a precision value that works in all directions. Compute the worst-case
	* omnidirectional precision, which is the length of the hypotenuse where
	* small_prim_precision is the length of the catheti.
	*
	* x = small_prim_precision
	* sqrt(x*x + x*x) = sqrt(x*x*2) = x * sqrt(2)
	*/
	nir_def *precision_distance = nir_fmul_imm(b, small_prim_precision, sqrt(2));

	/* Check if the pixel center is outside the triangle. If it is, the triangle can be
	* safely removed.
	*/
	outside_center = point_outside_triangle(b, pix_center, screen_pos, precision_distance);
	}
	nir_pop_if(b, if_tri_1px);

	outside_center = nir_if_phi(b, outside_center, fals);
	}
	nir_pop_if(b, if_passed);
	rejected = nir_if_phi(b, outside_center, rejected);
	}

	return rejected;
	}

	/* Invoke the optional "accepted" callback under a condition.
	 *
	 * b           - NIR builder used to emit the control flow.
	 * accepted    - condition of the if-block the callback is emitted into.
	 * accept_func - callback to run; when NULL nothing is emitted at all.
	 * state       - opaque pointer forwarded unchanged to accept_func.
	 *
	 * The callback is called as accept_func(b, state) between nir_push_if and nir_pop_if,
	 * so anything it emits through the builder lands inside the if-block on "accepted".
	 */
	static void
	call_accept_func(nir_builder *b, nir_def *accepted, ac_nir_cull_accepted accept_func,
	                 void *state)
	{
	   if (!accept_func)
	      return;

	   nir_if *if_accepted = nir_push_if(b, accepted);
	   /* NOTE(review): selection-control hint taken verbatim from the original code; presumably
	    * it tells later passes the branch is divergent but always taken by some lane - confirm.
	    */
	   if_accepted->control = nir_selection_control_divergent_always_taken;
	   {
	      accept_func(b, state);
	   }
	   nir_pop_if(b, if_accepted);
	}

	/* Emit the culling sequence for a triangle and return the "accepted" boolean.
	 *
	 * b                           - NIR builder.
	 * skip_viewport_state_culling - when true, the small-primitive test is not emitted.
	 * use_point_tri_intersection  - forwarded to cull_small_primitive_triangle.
	 * initially_accepted          - starting value that all tests are ANDed onto.
	 * pos                         - positions of the 3 vertices, 4 components each.
	 * w_info                      - result of analyze_position_w for these vertices.
	 * accept_func / state         - optional callback, see call_accept_func.
	 *
	 * Sequence:
	 *  1. Reject when all W are negative, zero or NaN, or when cull_face_triangle rejects.
	 *  2. Only if still accepted: compute the bounding box, run the frustum test and
	 *     (unless skipped) the small-primitive test, which is itself guarded by the
	 *     nir_load_cull_small_triangles_enabled_amd condition.
	 *  3. A bounding-box rejection is overridden when any vertex has a negative W.
	 *
	 * Returns a phi of the bounding-box result (taken branch) and "accepted" (other branch).
	 */
	static nir_def *
	ac_nir_cull_triangle(nir_builder *b,
	                     bool skip_viewport_state_culling,
	                     bool use_point_tri_intersection,
	                     nir_def *initially_accepted,
	                     nir_def *pos[3][4],
	                     position_w_info *w_info,
	                     ac_nir_cull_accepted accept_func,
	                     void *state)
	{
	   nir_def *accepted = initially_accepted;
	   accepted = nir_iand(b, accepted, nir_inot(b, w_info->all_w_negative_or_zero_or_nan));
	   accepted = nir_iand(b, accepted, nir_inot(b, cull_face_triangle(b, pos, w_info)));

	   nir_def *bbox_accepted = NULL;

	   nir_if *if_accepted = nir_push_if(b, accepted);
	   {
	      /* Arrays of SSA value pointers, filled in by calc_bbox_triangle. */
	      nir_def *bbox_min[2] = {0}, *bbox_max[2] = {0};
	      calc_bbox_triangle(b, pos, bbox_min, bbox_max);

	      nir_def *prim_outside_view = cull_frustrum(b, bbox_min, bbox_max);
	      nir_def *bbox_rejected = prim_outside_view;

	      if (!skip_viewport_state_culling) {
	         nir_if *if_cull_small_prims = nir_push_if(b, nir_load_cull_small_triangles_enabled_amd(b));
	         {
	            nir_def *small_prim_rejected = cull_small_primitive_triangle(b, use_point_tri_intersection,
	                                                                         bbox_min, bbox_max, pos);
	            bbox_rejected = nir_ior(b, bbox_rejected, small_prim_rejected);
	         }
	         nir_pop_if(b, if_cull_small_prims);

	         /* When small-triangle culling is disabled, only the frustum result applies. */
	         bbox_rejected = nir_if_phi(b, bbox_rejected, prim_outside_view);
	      }

	      /* Never reject based on the bounding box if any W is negative. */
	      bbox_accepted = nir_ior(b, nir_inot(b, bbox_rejected), w_info->any_w_negative);
	      call_accept_func(b, bbox_accepted, accept_func, state);
	   }
	   nir_pop_if(b, if_accepted);

	   return nir_if_phi(b, bbox_accepted, accepted);
	}

	/* Rotate (and rescale) a 2D point in place so that pixel diamonds become unit squares.
	 *
	 * b - NIR builder.
	 * v - in/out: v[0] is X and v[1] is Y; overwritten with (X - Y, X + Y).
	 */
	static void
	rotate_45degrees(nir_builder *b, nir_def *v[2])
	{
	   /* Rotating a point by 45 degrees:
	    *
	    *    x2 = x*cos(45) - y*sin(45)
	    *    y2 = x*sin(45) + y*cos(45)
	    *
	    * Since sin(45) == cos(45), we can write:
	    *
	    *    x2 = x*cos(45) - y*cos(45) = (x - y) * cos(45)
	    *    y2 = x*cos(45) + y*cos(45) = (x + y) * cos(45)
	    *
	    * The width of each square (rotated diamond) is sqrt(0.5), so we have to scale it to 1
	    * by multiplying by 1/sqrt(0.5) = sqrt(2) because we want round() to give us the position
	    * of the closest center of the square (rotated diamond). After scaling, we get:
	    *
	    *    x2 = (x - y) * cos(45) * sqrt(2)
	    *    y2 = (x + y) * cos(45) * sqrt(2)
	    *
	    * Since cos(45) * sqrt(2) = 1, we get:
	    *
	    *    x2 = x - y
	    *    y2 = x + y
	    */
	   /* Compute both results before storing: each one reads both inputs. */
	   nir_def *x2 = nir_fsub(b, v[0], v[1]);
	   nir_def *y2 = nir_fadd(b, v[0], v[1]);

	   v[0] = x2;
	   v[1] = y2;
	}

	/* Compute the X/Y bounding box of a line, widened by the half line width.
	 *
	 * b        - NIR builder.
	 * pos      - vertex positions; only pos[0] and pos[1], channels 0 and 1, are read.
	 * bbox_min - out: per-channel minimum of both vertices minus the half line width.
	 * bbox_max - out: per-channel maximum of both vertices plus the half line width.
	 *
	 * The half line width comes from nir_load_clip_half_line_width_amd, one channel per axis.
	 */
	static void
	calc_bbox_line(nir_builder *b, nir_def *pos[3][4], nir_def *bbox_min[2], nir_def *bbox_max[2])
	{
	   nir_def *clip_half_line_width = nir_load_clip_half_line_width_amd(b);

	   for (unsigned chan = 0; chan < 2; ++chan) {
	      bbox_min[chan] = nir_fmin(b, pos[0][chan], pos[1][chan]);
	      bbox_max[chan] = nir_fmax(b, pos[0][chan], pos[1][chan]);

	      nir_def *width = nir_channel(b, clip_half_line_width, chan);
	      bbox_min[chan] = nir_fsub(b, bbox_min[chan], width);
	      bbox_max[chan] = nir_fadd(b, bbox_max[chan], width);
	   }
	}

	/* Emit the small-primitive filter for lines.
	 *
	 * b                  - NIR builder.
	 * pos                - vertex positions; only pos[0] and pos[1], channels 0 and 1, are read.
	 * bbox_min, bbox_max - not used by this function; kept so the signature stays unchanged.
	 * prim_is_small_else - rejection value computed by the caller. It is ORed into the result
	 *                      when small-line culling is enabled and returned as-is otherwise.
	 *
	 * Returns a phi of the two cases above. The test only runs inside an if-block on the
	 * nir_load_cull_small_lines_enabled_amd condition.
	 */
	static nir_def *
	cull_small_primitive_line(nir_builder *b, nir_def *pos[3][4],
	                          nir_def *bbox_min[2], nir_def *bbox_max[2],
	                          nir_def *prim_is_small_else)
	{
	   nir_def *prim_is_small = NULL;

	   /* Small primitive filter - eliminate lines that are too small to affect a sample. */
	   nir_if *if_cull_small_prims = nir_push_if(b, nir_load_cull_small_lines_enabled_amd(b));
	   {
	      /* This only works with lines without perpendicular end caps (lines with perpendicular
	       * end caps are rasterized as quads and thus can't be culled as small prims in 99% of
	       * cases because line_width >= 1).
	       *
	       * This takes advantage of the diamond exit rule, which says that every pixel
	       * has a diamond inside it touching the pixel boundary and only if a line exits
	       * the diamond, that pixel is filled. If a line enters the diamond or stays
	       * outside the diamond, the pixel isn't filled.
	       *
	       * This algorithm is a little simpler than that. The space outside all diamonds also
	       * has the same diamond shape, which we'll call corner diamonds.
	       *
	       * The idea is to cull all lines that are entirely inside a diamond, including
	       * corner diamonds. If a line is entirely inside a diamond, it can be culled because
	       * it doesn't exit it. If a line is entirely inside a corner diamond, it can be culled
	       * because it doesn't enter any diamond and thus can't exit any diamond.
	       *
	       * The viewport is rotated by 45 degrees to turn diamonds into squares, and a bounding
	       * box test is used to determine whether a line is entirely inside any square (diamond).
	       *
	       * The line width doesn't matter. Wide lines only duplicate filled pixels in either X or
	       * Y direction from the filled pixels. MSAA also doesn't matter. MSAA should ideally use
	       * perpendicular end caps that enable quad rasterization for lines. Thus, this should
	       * always use non-MSAA viewport transformation and non-MSAA small prim precision.
	       *
	       * A good test is piglit/lineloop because it draws 10k subpixel lines in a circle.
	       * It should contain no holes if this matches hw behavior.
	       */
	      nir_def *v0[2], *v1[2];
	      nir_def *vp = nir_load_cull_line_viewport_xy_scale_and_offset_amd(b);

	      /* Get vertex positions in pixels. Channels 0-1 of vp are the XY scale,
	       * channels 2-3 are the XY offset.
	       */
	      for (unsigned chan = 0; chan < 2; chan++) {
	         nir_def *vp_scale = nir_channel(b, vp, chan);
	         nir_def *vp_translate = nir_channel(b, vp, 2 + chan);

	         v0[chan] = nir_ffma(b, pos[0][chan], vp_scale, vp_translate);
	         v1[chan] = nir_ffma(b, pos[1][chan], vp_scale, vp_translate);
	      }

	      /* Rotate the viewport by 45 degrees, so that diamonds become squares. */
	      rotate_45degrees(b, v0);
	      rotate_45degrees(b, v1);

	      nir_def *small_prim_precision = nir_load_cull_small_line_precision_amd(b);

	      nir_def *rounded_to_eq[2];
	      for (unsigned chan = 0; chan < 2; chan++) {
	         /* Compute the bounding box around both vertices. We do this because we must
	          * enlarge the line area by the precision of the rasterizer.
	          */
	         nir_def *min = nir_fmin(b, v0[chan], v1[chan]);
	         nir_def *max = nir_fmax(b, v0[chan], v1[chan]);

	         /* Enlarge the bounding box by the precision of the rasterizer. */
	         min = nir_fsub(b, min, small_prim_precision);
	         max = nir_fadd(b, max, small_prim_precision);

	         /* Round the bounding box corners. If both rounded corners are equal,
	          * the bounding box is entirely inside a square (diamond).
	          */
	         min = nir_fround_even(b, min);
	         max = nir_fround_even(b, max);

	         rounded_to_eq[chan] = nir_feq(b, min, max);
	      }

	      /* The line is small only if it stays inside one square on both rotated axes. */
	      prim_is_small = nir_iand(b, rounded_to_eq[0], rounded_to_eq[1]);
	      prim_is_small = nir_ior(b, prim_is_small, prim_is_small_else);
	   }
	   nir_pop_if(b, if_cull_small_prims);

	   return nir_if_phi(b, prim_is_small, prim_is_small_else);
	}

	/* Emit the culling sequence for a line and return the "accepted" boolean.
	 *
	 * b                           - NIR builder.
	 * skip_viewport_state_culling - when true, only the W test is applied and no
	 *                               bounding-box based culling is emitted.
	 * initially_accepted          - starting value that the W test is ANDed onto.
	 * pos                         - vertex positions, forwarded to the bounding-box helpers.
	 * w_info                      - result of analyze_position_w for these vertices.
	 * accept_func / state         - optional callback, see call_accept_func.
	 *
	 * Returns "accepted" directly on the skip path. Otherwise returns a phi of the
	 * bounding-box result (taken branch) and "accepted" (other branch).
	 */
	static nir_def *
	ac_nir_cull_line(nir_builder *b,
	                 bool skip_viewport_state_culling,
	                 nir_def *initially_accepted,
	                 nir_def *pos[3][4],
	                 position_w_info *w_info,
	                 ac_nir_cull_accepted accept_func,
	                 void *state)
	{
	   nir_def *accepted = initially_accepted;
	   accepted = nir_iand(b, accepted, nir_inot(b, w_info->all_w_negative_or_zero_or_nan));

	   if (skip_viewport_state_culling) {
	      call_accept_func(b, accepted, accept_func, state);
	      return accepted;
	   }

	   nir_def *bbox_accepted = NULL;

	   nir_if *if_accepted = nir_push_if(b, accepted);
	   {
	      /* Arrays of SSA value pointers, filled in by calc_bbox_line. */
	      nir_def *bbox_min[2] = {0}, *bbox_max[2] = {0};
	      calc_bbox_line(b, pos, bbox_min, bbox_max);

	      /* Frustum culling - eliminate lines that are fully outside the view. */
	      nir_def *prim_outside_view = cull_frustrum(b, bbox_min, bbox_max);
	      nir_def *prim_invisible =
	         cull_small_primitive_line(b, pos, bbox_min, bbox_max, prim_outside_view);

	      /* Never reject based on the bounding box if any W is negative. */
	      bbox_accepted = nir_ior(b, nir_inot(b, prim_invisible), w_info->any_w_negative);
	      call_accept_func(b, bbox_accepted, accept_func, state);
	   }
	   nir_pop_if(b, if_accepted);

	   return nir_if_phi(b, bbox_accepted, accepted);
	}

	/* Entry point: emit culling code for one primitive and return the "accepted" boolean.
	 *
	 * The per-vertex W analysis is done once up front, then the work is handed to the
	 * triangle path (3 vertices) or the line path (2 vertices). Any other vertex count
	 * is not supported and hits unreachable().
	 *
	 * use_point_tri_intersection is only forwarded to the triangle path.
	 */
	nir_def *
	ac_nir_cull_primitive(nir_builder *b,
	                      bool skip_viewport_state_culling,
	                      bool use_point_tri_intersection,
	                      nir_def *initially_accepted,
	                      nir_def *pos[3][4],
	                      unsigned num_vertices,
	                      ac_nir_cull_accepted accept_func,
	                      void *state)
	{
	   position_w_info w = {0};
	   analyze_position_w(b, pos, num_vertices, &w);

	   switch (num_vertices) {
	   case 3:
	      return ac_nir_cull_triangle(b, skip_viewport_state_culling, use_point_tri_intersection,
	                                  initially_accepted, pos, &w, accept_func, state);
	   case 2:
	      return ac_nir_cull_line(b, skip_viewport_state_culling, initially_accepted, pos, &w,
	                              accept_func, state);
	   default:
	      unreachable("point culling not implemented");
	   }

	   return NULL;
	}