| /* |
| * Copyright (C) 2021 Valve Corporation |
| * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #include "ir3_ra.h" |
| #include "util/rb_tree.h" |
| #include "util/u_math.h" |
| #include "ir3_shader.h" |
| |
| /* This file implements an SSA-based register allocator. Unlike other |
| * SSA-based allocators, it handles vector split/collect "smartly," meaning |
| * that multiple values may share the same register interval. From the |
| * perspective of the allocator itself, only the top-level intervals matter, |
| * and the allocator is only concerned with allocating top-level intervals, |
| * which may mean moving other top-level intervals around. Other intervals, |
| * like the destination of a split instruction or the source of a collect |
| * instruction, are "locked" to their parent interval. The details of this are |
| * mostly handled by ir3_merge_regs and ir3_reg_ctx. |
| * |
| * We currently don't do any backtracking, but we do use the merge sets as a |
| * form of affinity to try to avoid moves from phis/splits/collects. Each |
| * merge set is what a more "classic" graph-coloring or live-range based |
| * allocator would consider a single register, but here we use it as merely a |
| * hint, except when multiple overlapping values are live at the same time. |
| * Each merge set has a "preferred" register, and we try to honor that when |
| * allocating values in the merge set. |
| */ |
| |
| /* ir3_reg_ctx implementation. */ |
| |
| static int |
| ir3_reg_interval_cmp(const struct rb_node *node, const void *data) |
| { |
| unsigned reg = *(const unsigned *)data; |
| const struct ir3_reg_interval *interval = |
| ir3_rb_node_to_interval_const(node); |
| if (interval->reg->interval_start > reg) |
| return -1; |
| else if (interval->reg->interval_end <= reg) |
| return 1; |
| else |
| return 0; |
| } |
| |
| static struct ir3_reg_interval * |
| ir3_reg_interval_search(struct rb_tree *tree, unsigned offset) |
| { |
| struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp); |
| return node ? ir3_rb_node_to_interval(node) : NULL; |
| } |
| |
| static struct ir3_reg_interval * |
| ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset) |
| { |
| struct rb_node *node = |
| rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp); |
| return node ? ir3_rb_node_to_interval(node) : NULL; |
| } |
| |
| /* Get the interval covering the reg, or the closest to the right if it |
| * doesn't exist. |
| */ |
| static struct ir3_reg_interval * |
| ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset) |
| { |
| struct ir3_reg_interval *interval = |
| ir3_reg_interval_search_sloppy(tree, offset); |
| if (!interval) { |
| return NULL; |
| } else if (interval->reg->interval_end > offset) { |
| return interval; |
| } else { |
      /* There is no interval covering reg, and
       * ir3_reg_interval_search_sloppy() returned the closest range to the
       * left, so the next interval to the right should be the closest to the
       * right.
       */
| return ir3_reg_interval_next_or_null(interval); |
| } |
| } |
| |
| static int |
| ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b) |
| { |
| const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a); |
| const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b); |
| return b->reg->interval_start - a->reg->interval_start; |
| } |
| |
| static void |
| interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree, |
| struct ir3_reg_interval *interval) |
| { |
| struct ir3_reg_interval *right = |
| ir3_reg_interval_search_right(tree, interval->reg->interval_start); |
| if (right && right->reg->interval_start < interval->reg->interval_end) { |
| /* We disallow trees where different members have different half-ness. |
| * This means that we can't treat bitcasts as copies like normal |
| * split/collect, so something like this would require an extra copy |
| * in mergedregs mode, and count as 4 half-units of register pressure |
| * instead of 2: |
| * |
| * f16vec2 foo = unpackFloat2x16(bar) |
| * ... = foo.x |
| * ... = bar |
| * |
| * However, relaxing this rule would open a huge can of worms. What |
| * happens when there's a vector of 16 things, and the fifth element |
| * has been bitcasted as a half-reg? Would that element alone have to |
| * be small enough to be used as a half-reg source? Let's keep that |
| * can of worms firmly shut for now. |
| */ |
| assert((interval->reg->flags & IR3_REG_HALF) == |
| (right->reg->flags & IR3_REG_HALF)); |
| |
| if (right->reg->interval_end <= interval->reg->interval_end && |
| right->reg->interval_start >= interval->reg->interval_start) { |
| /* Check if we're inserting something that's already inserted */ |
| assert(interval != right); |
| |
| /* "right" is contained in "interval" and must become a child of |
| * it. There may be further children too. |
| */ |
| for (struct ir3_reg_interval *next = ir3_reg_interval_next(right); |
| right && right->reg->interval_start < interval->reg->interval_end; |
| right = next, next = ir3_reg_interval_next_or_null(next)) { |
| /* "right" must be contained in "interval." */ |
| assert(right->reg->interval_end <= interval->reg->interval_end); |
| assert((interval->reg->flags & IR3_REG_HALF) == |
| (right->reg->flags & IR3_REG_HALF)); |
| if (!right->parent) |
| ctx->interval_delete(ctx, right); |
| right->parent = interval; |
| rb_tree_remove(tree, &right->node); |
| rb_tree_insert(&interval->children, &right->node, |
| ir3_reg_interval_insert_cmp); |
| } |
| } else { |
| /* "right" must contain "interval," since intervals must form a |
| * tree. |
| */ |
| assert(right->reg->interval_start <= interval->reg->interval_start); |
| interval->parent = right; |
| interval_insert(ctx, &right->children, interval); |
| return; |
| } |
| } |
| |
| if (!interval->parent) |
| ctx->interval_add(ctx, interval); |
| rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp); |
| interval->inserted = true; |
| } |
| |
| void |
| ir3_reg_interval_insert(struct ir3_reg_ctx *ctx, |
| struct ir3_reg_interval *interval) |
| { |
| rb_tree_init(&interval->children); |
| interval->parent = NULL; |
| interval_insert(ctx, &ctx->intervals, interval); |
| } |
| |
| /* Call after ir3_reg_interval_remove_temp() to reinsert the interval */ |
| static void |
| ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx, |
| struct ir3_reg_interval *interval) |
| { |
| interval->parent = NULL; |
| interval_insert(ctx, &ctx->intervals, interval); |
| } |
| |
| void |
| ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, |
| struct ir3_reg_interval *interval) |
| { |
| if (interval->parent) { |
| rb_tree_remove(&interval->parent->children, &interval->node); |
| } else { |
| ctx->interval_delete(ctx, interval); |
| rb_tree_remove(&ctx->intervals, &interval->node); |
| } |
| |
| rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children, |
| node) { |
| rb_tree_remove(&interval->children, &child->node); |
| child->parent = interval->parent; |
| |
| if (interval->parent) { |
| rb_tree_insert(&child->parent->children, &child->node, |
| ir3_reg_interval_insert_cmp); |
| } else { |
| ctx->interval_readd(ctx, interval, child); |
| rb_tree_insert(&ctx->intervals, &child->node, |
| ir3_reg_interval_insert_cmp); |
| } |
| } |
| |
| interval->inserted = false; |
| } |
| |
| static void |
| _mark_free(struct ir3_reg_interval *interval) |
| { |
| interval->inserted = false; |
| rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) { |
| _mark_free(child); |
| } |
| } |
| |
| /* Remove an interval and all its children from the tree. */ |
| void |
| ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx, |
| struct ir3_reg_interval *interval) |
| { |
| assert(!interval->parent); |
| |
| ctx->interval_delete(ctx, interval); |
| rb_tree_remove(&ctx->intervals, &interval->node); |
| _mark_free(interval); |
| } |
| |
/* Used when popping an interval to be shuffled around. Don't disturb the
 * children so that the interval can be reinserted later.
 */
| static void |
| ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx, |
| struct ir3_reg_interval *interval) |
| { |
| assert(!interval->parent); |
| |
| ctx->interval_delete(ctx, interval); |
| rb_tree_remove(&ctx->intervals, &interval->node); |
| } |
| |
| static void |
| interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval, |
| unsigned indent) |
| { |
| for (unsigned i = 0; i < indent; i++) |
| mesa_log_stream_printf(stream, "\t"); |
| mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name, |
| interval->reg->interval_start); |
| |
| rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) { |
| interval_dump(stream, child, indent + 1); |
| } |
| |
| for (unsigned i = 0; i < indent; i++) |
| mesa_log_stream_printf(stream, "\t"); |
| mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name, |
| interval->reg->interval_end); |
| } |
| |
| void |
| ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval) |
| { |
| interval_dump(stream, interval, 0); |
| } |
| |
| /* These are the core datastructures used by the register allocator. First |
| * ra_interval and ra_file, which are used for intra-block tracking and use |
| * the ir3_reg_ctx infrastructure: |
| */ |
| |
| struct ra_interval { |
| struct ir3_reg_interval interval; |
| |
| struct rb_node physreg_node; |
| physreg_t physreg_start, physreg_end; |
| |
| /* True if this is a source of the current instruction which is entirely |
| * killed. This means we can allocate the dest over it, but we can't break |
| * it up. |
| */ |
| bool is_killed; |
| |
| /* True if this interval cannot be moved from its position. This is only |
| * used for precolored inputs to ensure that other inputs don't get |
| * allocated on top of them. |
| */ |
| bool frozen; |
| }; |
| |
| struct ra_file { |
| struct ir3_reg_ctx reg_ctx; |
| |
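   /* Bitsets of free physregs. "available" also counts registers occupied
    * only by killed sources, which a destination may safely overlap, whereas
    * "available_to_evict" does not, since evicted live-through values must
    * not land on top of killed sources.
    */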
| BITSET_DECLARE(available, RA_MAX_FILE_SIZE); |
| BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE); |
| |
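   /* Top-level live intervals, sorted by physreg_start. */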
| struct rb_tree physreg_intervals; |
| |
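   /* Size of the file in physreg (half-register) units, and the round-robin
    * cursor used by find_best_gap().
    */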
| unsigned size; |
| unsigned start; |
| }; |
| |
| /* State for inter-block tracking. When we split a live range to make space |
| * for a vector, we may need to insert fixup code when a block has multiple |
| * predecessors that have moved the same live value to different registers. |
| * This keeps track of state required to do that. |
| */ |
| |
| struct ra_block_state { |
| /* Map of defining ir3_register -> physreg it was allocated to at the end |
| * of the block. |
| */ |
| struct hash_table *renames; |
| |
| /* For loops, we need to process a block before all its predecessors have |
| * been processed. In particular, we need to pick registers for values |
| * without knowing if all the predecessors have been renamed. This keeps |
| * track of the registers we chose so that when we visit the back-edge we |
| * can move them appropriately. If all predecessors have been visited |
| * before this block is visited then we don't need to fill this out. This |
| * is a map from ir3_register -> physreg. |
| */ |
| struct hash_table *entry_regs; |
| |
| /* True if the block has been visited and "renames" is complete. |
| */ |
| bool visited; |
| }; |
| |
| struct ra_parallel_copy { |
| struct ra_interval *interval; |
| physreg_t src; |
| }; |
| |
| /* The main context: */ |
| |
| struct ra_ctx { |
| /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom |
| * half of this file too. |
| */ |
| struct ra_file full; |
| |
| /* hr0.x - hr63.w, only used without merged-regs. */ |
| struct ra_file half; |
| |
| /* Shared regs. */ |
| struct ra_file shared; |
| |
| struct ir3_liveness *live; |
| |
| struct ir3_block *block; |
| |
| const struct ir3_compiler *compiler; |
| gl_shader_stage stage; |
| |
| /* Pending moves of top-level intervals that will be emitted once we're |
| * finished: |
| */ |
| DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies); |
| |
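   /* Allocation state per SSA definition, indexed by ir3_register name, and
    * per-block state, respectively.
    */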
| struct ra_interval *intervals; |
| struct ra_block_state *blocks; |
| |
| bool merged_regs; |
| }; |
| |
| #define foreach_interval(interval, file) \ |
| rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals, \ |
| physreg_node) |
#define foreach_interval_rev(interval, file)                                   \
   rb_tree_foreach_rev (struct ra_interval, interval,                          \
                        &(file)->physreg_intervals, physreg_node)
| #define foreach_interval_safe(interval, file) \ |
| rb_tree_foreach_safe (struct ra_interval, interval, \ |
| &(file)->physreg_intervals, physreg_node) |
| #define foreach_interval_rev_safe(interval, file) \ |
| rb_tree_foreach_rev_safe(struct ra_interval, interval, \ |
| &(file)->physreg_intervals, physreg_node) |
| |
| static struct ra_interval * |
| rb_node_to_interval(struct rb_node *node) |
| { |
| return rb_node_data(struct ra_interval, node, physreg_node); |
| } |
| |
| static const struct ra_interval * |
| rb_node_to_interval_const(const struct rb_node *node) |
| { |
| return rb_node_data(struct ra_interval, node, physreg_node); |
| } |
| |
| static struct ra_interval * |
| ra_interval_next(struct ra_interval *interval) |
| { |
| struct rb_node *next = rb_node_next(&interval->physreg_node); |
| return next ? rb_node_to_interval(next) : NULL; |
| } |
| |
| static struct ra_interval * |
| ra_interval_next_or_null(struct ra_interval *interval) |
| { |
| return interval ? ra_interval_next(interval) : NULL; |
| } |
| |
| static int |
| ra_interval_cmp(const struct rb_node *node, const void *data) |
| { |
| physreg_t reg = *(const physreg_t *)data; |
| const struct ra_interval *interval = rb_node_to_interval_const(node); |
| if (interval->physreg_start > reg) |
| return -1; |
| else if (interval->physreg_end <= reg) |
| return 1; |
| else |
| return 0; |
| } |
| |
| static struct ra_interval * |
| ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg) |
| { |
| struct rb_node *node = rb_tree_search_sloppy(tree, ®, ra_interval_cmp); |
| return node ? rb_node_to_interval(node) : NULL; |
| } |
| |
| /* Get the interval covering the reg, or the closest to the right if it |
| * doesn't exist. |
| */ |
| static struct ra_interval * |
| ra_interval_search_right(struct rb_tree *tree, physreg_t reg) |
| { |
| struct ra_interval *interval = ra_interval_search_sloppy(tree, reg); |
| if (!interval) { |
| return NULL; |
| } else if (interval->physreg_end > reg) { |
| return interval; |
| } else { |
      /* There is no interval covering reg, and ra_interval_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
| return ra_interval_next_or_null(interval); |
| } |
| } |
| |
| static struct ra_interval * |
| ra_file_search_right(struct ra_file *file, physreg_t reg) |
| { |
| return ra_interval_search_right(&file->physreg_intervals, reg); |
| } |
| |
| static int |
| ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b) |
| { |
| const struct ra_interval *a = rb_node_to_interval_const(_a); |
| const struct ra_interval *b = rb_node_to_interval_const(_b); |
| return b->physreg_start - a->physreg_start; |
| } |
| |
| static struct ra_interval * |
| ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval) |
| { |
| return rb_node_data(struct ra_interval, interval, interval); |
| } |
| |
| static struct ra_file * |
| ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx) |
| { |
| return rb_node_data(struct ra_file, ctx, reg_ctx); |
| } |
| |
| static void |
| interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval) |
| { |
| struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); |
| struct ra_file *file = ir3_reg_ctx_to_file(ctx); |
| |
| /* We can assume in this case that physreg_start/physreg_end is already |
| * initialized. |
| */ |
| for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { |
| BITSET_CLEAR(file->available, i); |
| BITSET_CLEAR(file->available_to_evict, i); |
| } |
| |
| rb_tree_insert(&file->physreg_intervals, &interval->physreg_node, |
| ra_interval_insert_cmp); |
| } |
| |
| static void |
| interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval) |
| { |
| struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); |
| struct ra_file *file = ir3_reg_ctx_to_file(ctx); |
| |
| for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { |
| BITSET_SET(file->available, i); |
| BITSET_SET(file->available_to_evict, i); |
| } |
| |
| rb_tree_remove(&file->physreg_intervals, &interval->physreg_node); |
| } |
| |
| static void |
| interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent, |
| struct ir3_reg_interval *_child) |
| { |
| struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent); |
| struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child); |
| |
| child->physreg_start = |
| parent->physreg_start + (child->interval.reg->interval_start - |
| parent->interval.reg->interval_start); |
| child->physreg_end = |
| child->physreg_start + |
| (child->interval.reg->interval_end - child->interval.reg->interval_start); |
| |
| interval_add(ctx, _child); |
| } |
| |
| static void |
| ra_file_init(struct ra_file *file) |
| { |
| for (unsigned i = 0; i < file->size; i++) { |
| BITSET_SET(file->available, i); |
| BITSET_SET(file->available_to_evict, i); |
| } |
| |
| rb_tree_init(&file->reg_ctx.intervals); |
| rb_tree_init(&file->physreg_intervals); |
| |
| file->reg_ctx.interval_add = interval_add; |
| file->reg_ctx.interval_delete = interval_delete; |
| file->reg_ctx.interval_readd = interval_readd; |
| } |
| |
| static void |
| ra_file_insert(struct ra_file *file, struct ra_interval *interval) |
| { |
| assert(interval->physreg_start < interval->physreg_end); |
| assert(interval->physreg_end <= file->size); |
| if (interval->interval.reg->flags & IR3_REG_HALF) |
| assert(interval->physreg_end <= RA_HALF_SIZE); |
| |
| ir3_reg_interval_insert(&file->reg_ctx, &interval->interval); |
| } |
| |
| static void |
| ra_file_remove(struct ra_file *file, struct ra_interval *interval) |
| { |
| ir3_reg_interval_remove(&file->reg_ctx, &interval->interval); |
| } |
| |
| static void |
| ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval) |
| { |
| assert(!interval->interval.parent); |
| |
| for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { |
| BITSET_SET(file->available, i); |
| } |
| |
| interval->is_killed = true; |
| } |
| |
| static void |
| ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval) |
| { |
| assert(!interval->interval.parent); |
| |
| for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { |
| BITSET_CLEAR(file->available, i); |
| } |
| |
| interval->is_killed = false; |
| } |
| |
| static physreg_t |
| ra_interval_get_physreg(const struct ra_interval *interval) |
| { |
| unsigned child_start = interval->interval.reg->interval_start; |
| |
| while (interval->interval.parent) { |
| interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); |
| } |
| |
| return interval->physreg_start + |
| (child_start - interval->interval.reg->interval_start); |
| } |
| |
| static unsigned |
| ra_interval_get_num(const struct ra_interval *interval) |
| { |
| return ra_physreg_to_num(ra_interval_get_physreg(interval), |
| interval->interval.reg->flags); |
| } |
| |
| static void |
| ra_interval_init(struct ra_interval *interval, struct ir3_register *reg) |
| { |
| ir3_reg_interval_init(&interval->interval, reg); |
| interval->is_killed = false; |
| interval->frozen = false; |
| } |
| |
| static void |
| ra_interval_dump(struct log_stream *stream, struct ra_interval *interval) |
| { |
| mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start); |
| |
| ir3_reg_interval_dump(stream, &interval->interval); |
| } |
| |
| static void |
| ra_file_dump(struct log_stream *stream, struct ra_file *file) |
| { |
| rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, |
| physreg_node) { |
| ra_interval_dump(stream, interval); |
| } |
| |
| unsigned start, end; |
| mesa_log_stream_printf(stream, "available:\n"); |
| BITSET_FOREACH_RANGE (start, end, file->available, file->size) { |
| mesa_log_stream_printf(stream, "%u-%u ", start, end); |
| } |
| mesa_log_stream_printf(stream, "\n"); |
| |
| mesa_log_stream_printf(stream, "available to evict:\n"); |
| BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) { |
| mesa_log_stream_printf(stream, "%u-%u ", start, end); |
| } |
| mesa_log_stream_printf(stream, "\n"); |
| mesa_log_stream_printf(stream, "start: %u\n", file->start); |
| } |
| |
| static void |
| ra_ctx_dump(struct ra_ctx *ctx) |
| { |
| struct log_stream *stream = mesa_log_streami(); |
| mesa_log_stream_printf(stream, "full:\n"); |
| ra_file_dump(stream, &ctx->full); |
| mesa_log_stream_printf(stream, "half:\n"); |
| ra_file_dump(stream, &ctx->half); |
| mesa_log_stream_printf(stream, "shared:"); |
| ra_file_dump(stream, &ctx->shared); |
| mesa_log_stream_destroy(stream); |
| } |
| |
| static unsigned |
| reg_file_size(struct ra_file *file, struct ir3_register *reg) |
| { |
| /* Half-regs can only take up the first half of the combined regfile */ |
| if (reg->flags & IR3_REG_HALF) |
| return MIN2(file->size, RA_HALF_SIZE); |
| else |
| return file->size; |
| } |
| |
| /* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple |
| * top-level intervals at once. Pop multiple intervals, then push them back in |
| * any order. |
| */ |
| |
| struct ra_removed_interval { |
| struct ra_interval *interval; |
| unsigned size; |
| }; |
| |
| static struct ra_removed_interval |
| ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file, |
| struct ra_interval *interval) |
| { |
| assert(!interval->interval.parent); |
| |
| /* Check if we've already moved this reg before */ |
| unsigned pcopy_index; |
| for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count; |
| pcopy_index++) { |
| if (ctx->parallel_copies[pcopy_index].interval == interval) |
| break; |
| } |
| |
| if (pcopy_index == ctx->parallel_copies_count) { |
| array_insert(ctx, ctx->parallel_copies, |
| (struct ra_parallel_copy){ |
| .interval = interval, |
| .src = interval->physreg_start, |
| }); |
| } |
| |
| ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval); |
| |
| return (struct ra_removed_interval){ |
| .interval = interval, |
| .size = interval->physreg_end - interval->physreg_start, |
| }; |
| } |
| |
| static void |
| ra_push_interval(struct ra_ctx *ctx, struct ra_file *file, |
| const struct ra_removed_interval *removed, physreg_t dst) |
| { |
| struct ra_interval *interval = removed->interval; |
| |
| interval->physreg_start = dst; |
| interval->physreg_end = dst + removed->size; |
| |
| assert(interval->physreg_end <= file->size); |
| if (interval->interval.reg->flags & IR3_REG_HALF) |
| assert(interval->physreg_end <= RA_HALF_SIZE); |
| |
| ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval); |
| } |
| |
| /* Pick up the interval and place it at "dst". */ |
| static void |
| ra_move_interval(struct ra_ctx *ctx, struct ra_file *file, |
| struct ra_interval *interval, physreg_t dst) |
| { |
| struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval); |
| ra_push_interval(ctx, file, &temp, dst); |
| } |
| |
| static struct ra_file * |
| ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg) |
| { |
| if (reg->flags & IR3_REG_SHARED) |
| return &ctx->shared; |
| else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF)) |
| return &ctx->full; |
| else |
| return &ctx->half; |
| } |
| |
| |
/* Returns true if the proposed spot [start, end) for "dst" (or for a killed
 * source being moved to make space) overlaps a destination of the same
 * instruction that has already been allocated.
 */
| static bool |
| check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file, |
| struct ir3_register *dst, physreg_t start, |
| physreg_t end) |
| { |
| struct ir3_instruction *instr = dst->instr; |
| |
| ra_foreach_dst (other_dst, instr) { |
| /* We assume only destinations before the current one have been allocated. |
| */ |
| if (other_dst == dst) |
| break; |
| |
| if (ra_get_file(ctx, other_dst) != file) |
| continue; |
| |
| struct ra_interval *other_interval = &ctx->intervals[other_dst->name]; |
| assert(!other_interval->interval.parent); |
| physreg_t other_start = other_interval->physreg_start; |
| physreg_t other_end = other_interval->physreg_end; |
| |
| if (other_end > start && end > other_start) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* True if the destination is "early-clobber," meaning that it cannot be |
| * allocated over killed sources. Some destinations always require it, but it |
| * also is implicitly true for tied destinations whose source is live-through. |
| * If the source is killed, then we skip allocating a register for the |
| * destination altogether so we don't need to worry about that case here. |
| */ |
| static bool |
| is_early_clobber(struct ir3_register *reg) |
| { |
| return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied; |
| } |
| |
| static bool |
| get_reg_specified(struct ra_ctx *ctx, struct ra_file *file, |
| struct ir3_register *reg, physreg_t physreg, bool is_source) |
| { |
| for (unsigned i = 0; i < reg_size(reg); i++) { |
| if (!BITSET_TEST(is_early_clobber(reg) || is_source ? |
| file->available_to_evict : file->available, |
| physreg + i)) |
| return false; |
| } |
| |
| if (!is_source && |
| check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg))) |
| return false; |
| |
| return true; |
| } |
| |
| /* Try to evict any registers conflicting with the proposed spot "physreg" for |
| * "reg". That is, move them to other places so that we can allocate "physreg" |
| * here. |
| */ |
| |
| static bool |
| try_evict_regs(struct ra_ctx *ctx, struct ra_file *file, |
| struct ir3_register *reg, physreg_t physreg, |
| unsigned *_eviction_count, bool is_source, bool speculative) |
| { |
| BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE); |
| memcpy(available_to_evict, file->available_to_evict, |
| sizeof(available_to_evict)); |
| |
| BITSET_DECLARE(available, RA_MAX_FILE_SIZE); |
| memcpy(available, file->available, sizeof(available)); |
| |
| for (unsigned i = 0; i < reg_size(reg); i++) { |
| BITSET_CLEAR(available_to_evict, physreg + i); |
| BITSET_CLEAR(available, physreg + i); |
| } |
| |
| unsigned eviction_count = 0; |
| /* Iterate over each range conflicting with physreg */ |
| for (struct ra_interval *conflicting = ra_file_search_right(file, physreg), |
| *next = ra_interval_next_or_null(conflicting); |
| conflicting != NULL && |
| conflicting->physreg_start < physreg + reg_size(reg); |
| conflicting = next, next = ra_interval_next_or_null(next)) { |
| if (!is_early_clobber(reg) && !is_source && conflicting->is_killed) |
| continue; |
| |
| if (conflicting->frozen) { |
| assert(speculative); |
| return false; |
| } |
| |
| unsigned conflicting_file_size = |
| reg_file_size(file, conflicting->interval.reg); |
| unsigned avail_start, avail_end; |
| bool evicted = false; |
| BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict, |
| conflicting_file_size) { |
| unsigned size = avail_end - avail_start; |
| |
| /* non-half registers must be aligned */ |
| if (!(conflicting->interval.reg->flags & IR3_REG_HALF) && |
| avail_start % 2 == 1) { |
| avail_start++; |
| size--; |
| } |
| |
| unsigned conflicting_size = |
| conflicting->physreg_end - conflicting->physreg_start; |
| if (size >= conflicting_size && |
| !check_dst_overlap(ctx, file, reg, avail_start, avail_start + |
| conflicting_size)) { |
| for (unsigned i = 0; |
| i < conflicting->physreg_end - conflicting->physreg_start; i++) |
| BITSET_CLEAR(available_to_evict, avail_start + i); |
| eviction_count += |
| conflicting->physreg_end - conflicting->physreg_start; |
| if (!speculative) |
| ra_move_interval(ctx, file, conflicting, avail_start); |
| evicted = true; |
| break; |
| } |
| } |
| |
| if (evicted) |
| continue; |
| |
      /* If we couldn't evict this range, we may be able to swap it with a
       * killed range to achieve the same effect.
       */
| foreach_interval (killed, file) { |
| if (!killed->is_killed) |
| continue; |
| |
| if (killed->physreg_end - killed->physreg_start != |
| conflicting->physreg_end - conflicting->physreg_start) |
| continue; |
| |
| if (killed->physreg_end > conflicting_file_size || |
| conflicting->physreg_end > reg_file_size(file, killed->interval.reg)) |
| continue; |
| |
| /* We can't swap the killed range if it partially/fully overlaps the |
| * space we're trying to allocate or (in speculative mode) if it's |
| * already been swapped and will overlap when we actually evict. |
| */ |
| bool killed_available = true; |
| for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) { |
| if (!BITSET_TEST(available, i)) { |
| killed_available = false; |
| break; |
| } |
| } |
| |
| if (!killed_available) |
| continue; |
| |
| if (check_dst_overlap(ctx, file, reg, killed->physreg_start, |
| killed->physreg_end)) |
| continue; |
| |
| /* Check for alignment if one is a full reg */ |
| if ((!(killed->interval.reg->flags & IR3_REG_HALF) || |
| !(conflicting->interval.reg->flags & IR3_REG_HALF)) && |
| (killed->physreg_start % 2 != 0 || |
| conflicting->physreg_start % 2 != 0)) |
| continue; |
| |
| for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) { |
| BITSET_CLEAR(available, i); |
| } |
| /* Because this will generate swaps instead of moves, multiply the |
| * cost by 2. |
| */ |
| eviction_count += (killed->physreg_end - killed->physreg_start) * 2; |
| if (!speculative) { |
| physreg_t killed_start = killed->physreg_start, |
| conflicting_start = conflicting->physreg_start; |
| struct ra_removed_interval killed_removed = |
| ra_pop_interval(ctx, file, killed); |
| struct ra_removed_interval conflicting_removed = |
| ra_pop_interval(ctx, file, conflicting); |
| ra_push_interval(ctx, file, &killed_removed, conflicting_start); |
| ra_push_interval(ctx, file, &conflicting_removed, killed_start); |
| } |
| |
| evicted = true; |
| break; |
| } |
| |
| if (!evicted) |
| return false; |
| } |
| |
| *_eviction_count = eviction_count; |
| return true; |
| } |
| |
| static int |
| removed_interval_cmp(const void *_i1, const void *_i2) |
| { |
| const struct ra_removed_interval *i1 = _i1; |
| const struct ra_removed_interval *i2 = _i2; |
| |
| /* We sort the registers as follows: |
| * |
| * |------------------------------------------------------------------------------------------| |
| * | | | | | | | |
| * | Half | Half early-clobber | Half | Full | Full early-clobber | Full | |
| * | live-through | destination | killed | killed | destination | live-through | |
| * | | | | | | | |
| * |------------------------------------------------------------------------------------------| |
| * | | |
| * | Destination | |
| * | | |
| * |-----------------| |
| * |
| * Half-registers have to be first so that they stay in the low half of |
| * the register file. Then half and full killed must stay together so that |
| * there's a contiguous range where we can put the register. With this |
    * structure we should be able to accommodate any collection of intervals
| * such that the total number of half components is within the half limit |
| * and the combined components are within the full limit. |
| */ |
| |
| unsigned i1_align = reg_elem_size(i1->interval->interval.reg); |
| unsigned i2_align = reg_elem_size(i2->interval->interval.reg); |
| if (i1_align > i2_align) |
| return 1; |
| if (i1_align < i2_align) |
| return -1; |
| |
| if (i1_align == 1) { |
| if (i2->interval->is_killed) |
| return -1; |
| if (i1->interval->is_killed) |
| return 1; |
| } else { |
| if (i2->interval->is_killed) |
| return 1; |
| if (i1->interval->is_killed) |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| static int |
| dsts_cmp(const void *_i1, const void *_i2) |
| { |
| struct ir3_register *i1 = *(struct ir3_register *const *) _i1; |
| struct ir3_register *i2 = *(struct ir3_register *const *) _i2; |
| |
   /* Treat tied destinations as if they were live-through sources, and normal
    * destinations as killed sources.
    */
| unsigned i1_align = reg_elem_size(i1); |
| unsigned i2_align = reg_elem_size(i2); |
| if (i1_align > i2_align) |
| return 1; |
| if (i1_align < i2_align) |
| return -1; |
| |
| if (i1_align == 1) { |
| if (!is_early_clobber(i2)) |
| return -1; |
| if (!is_early_clobber(i1)) |
| return 1; |
| } else { |
| if (!is_early_clobber(i2)) |
| return 1; |
| if (!is_early_clobber(i1)) |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* "Compress" all the live intervals so that there is enough space for the |
| * destination register. As there can be gaps when a more-aligned interval |
| * follows a less-aligned interval, this also sorts them to remove such |
| * "padding", which may be required when space is very tight. This isn't |
| * amazing, but should be used only as a last resort in case the register file |
| * is almost full and badly fragmented. |
| * |
| * Return the physreg to use. |
| */ |
| static physreg_t |
| compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, |
| struct ir3_register *reg) |
| { |
| unsigned align = reg_elem_size(reg); |
| DECLARE_ARRAY(struct ra_removed_interval, intervals); |
| intervals_count = intervals_sz = 0; |
| intervals = NULL; |
| |
| DECLARE_ARRAY(struct ir3_register *, dsts); |
| dsts_count = dsts_sz = 0; |
| dsts = NULL; |
| array_insert(ctx, dsts, reg); |
   bool dst_inserted[reg->instr->dsts_count];
   memset(dst_inserted, 0, sizeof(dst_inserted));
| |
| unsigned dst_size = reg->tied ? 0 : reg_size(reg); |
| unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0; |
| unsigned half_dst_size = 0, ec_half_dst_size = 0; |
| if (align == 1) { |
| half_dst_size = dst_size; |
| ec_half_dst_size = ec_dst_size; |
| } |
| |
| unsigned removed_size = 0, removed_half_size = 0; |
| unsigned removed_killed_size = 0, removed_killed_half_size = 0; |
| unsigned file_size = |
| align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size; |
| physreg_t start_reg = 0; |
| |
| foreach_interval_rev_safe (interval, file) { |
| /* We'll check if we can compact the intervals starting here. */ |
| physreg_t candidate_start = interval->physreg_end; |
| |
| /* Check if there are any other destinations we need to compact. */ |
| ra_foreach_dst_n (other_dst, n, reg->instr) { |
| if (other_dst == reg) |
| break; |
| if (ra_get_file(ctx, other_dst) != file) |
| continue; |
| if (dst_inserted[n]) |
| continue; |
| |
| struct ra_interval *other_interval = &ctx->intervals[other_dst->name]; |
         /* If the destination partially overlaps this interval, we need to
          * extend candidate_start past the end of the destination.
          */
| if (other_interval->physreg_start < candidate_start) { |
| candidate_start = MAX2(candidate_start, |
| other_interval->physreg_end); |
| continue; |
| } |
| |
| dst_inserted[n] = true; |
| |
| /* dst intervals with a tied killed source are considered attached to |
| * that source. Don't actually insert them. This means we have to |
| * update them below if their tied source moves. |
| */ |
| if (other_dst->tied) { |
| struct ra_interval *tied_interval = |
| &ctx->intervals[other_dst->tied->def->name]; |
| if (tied_interval->is_killed) |
| continue; |
| } |
| |
| d("popping destination %u physreg %u\n", |
| other_interval->interval.reg->name, |
| other_interval->physreg_start); |
| |
| array_insert(ctx, dsts, other_dst); |
| unsigned interval_size = reg_size(other_dst); |
| if (is_early_clobber(other_dst)) { |
| ec_dst_size += interval_size; |
| if (other_interval->interval.reg->flags & IR3_REG_HALF) |
| ec_half_dst_size += interval_size; |
| } else { |
| dst_size += interval_size; |
| if (other_interval->interval.reg->flags & IR3_REG_HALF) |
| half_dst_size += interval_size; |
| } |
| } |
| |
| /* Check if we can sort the intervals *after* this one and have enough |
       * space leftover to accommodate all intervals, keeping in mind that killed
| * sources overlap non-tied destinations. Also check that we have enough |
| * space leftover for half-registers, if we're inserting a half-register |
| * (otherwise we only shift any half-registers down so they should be |
| * safe). |
| */ |
| if (candidate_start + removed_size + ec_dst_size + |
| MAX2(removed_killed_size, dst_size) <= file->size && |
| (align != 1 || |
| candidate_start + removed_half_size + ec_half_dst_size + |
| MAX2(removed_killed_half_size, half_dst_size) <= file_size)) { |
| start_reg = candidate_start; |
| break; |
| } |
| |
| /* We assume that all frozen intervals are at the start and that we |
| * can avoid popping them. |
| */ |
| assert(!interval->frozen); |
| |
| /* Killed sources are different because they go at the end and can |
| * overlap the register we're trying to add. |
| */ |
| unsigned interval_size = interval->physreg_end - interval->physreg_start; |
| if (interval->is_killed) { |
| removed_killed_size += interval_size; |
| if (interval->interval.reg->flags & IR3_REG_HALF) |
| removed_killed_half_size += interval_size; |
| } else { |
| removed_size += interval_size; |
| if (interval->interval.reg->flags & IR3_REG_HALF) |
| removed_half_size += interval_size; |
| } |
| |
| /* Now that we've done the accounting, pop this off */ |
| d("popping interval %u physreg %u%s\n", interval->interval.reg->name, |
| interval->physreg_start, interval->is_killed ? ", killed" : ""); |
| array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval)); |
| } |
| |
| /* TODO: In addition to skipping registers at the beginning that are |
| * well-packed, we should try to skip registers at the end. |
| */ |
| |
| qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp); |
| qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp); |
| |
| physreg_t live_reg = start_reg; |
| physreg_t dst_reg = (physreg_t)~0; |
| physreg_t ret_reg = (physreg_t)~0; |
| unsigned dst_index = 0; |
| unsigned live_index = 0; |
| |
   /* We have two lists of intervals to process, live intervals and destination
    * intervals. Process them in the order of the diagram in
    * removed_interval_cmp().
    */
| while (live_index < intervals_count || dst_index < dsts_count) { |
| bool process_dst; |
| if (live_index == intervals_count) { |
| process_dst = true; |
| } else if (dst_index == dsts_count) { |
| process_dst = false; |
| } else { |
| struct ir3_register *dst = dsts[dst_index]; |
| struct ra_interval *live_interval = intervals[live_index].interval; |
| |
| bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF; |
| bool live_killed = live_interval->is_killed; |
| bool dst_half = dst->flags & IR3_REG_HALF; |
| bool dst_early_clobber = is_early_clobber(dst); |
| |
| if (live_half && !live_killed) { |
| /* far-left of diagram. */ |
| process_dst = false; |
| } else if (dst_half && dst_early_clobber) { |
| /* mid-left of diagram. */ |
| process_dst = true; |
| } else if (!dst_early_clobber) { |
            /* bottom of diagram. */
| process_dst = true; |
| } else if (live_killed) { |
| /* middle of diagram. */ |
| process_dst = false; |
| } else if (!dst_half && dst_early_clobber) { |
| /* mid-right of diagram. */ |
| process_dst = true; |
| } else { |
| /* far right of diagram. */ |
| assert(!live_killed && !live_half); |
| process_dst = false; |
| } |
| } |
| |
| struct ir3_register *cur_reg = |
| process_dst ? dsts[dst_index] : |
| intervals[live_index].interval->interval.reg; |
| |
| physreg_t physreg; |
| if (process_dst && !is_early_clobber(cur_reg)) { |
| if (dst_reg == (physreg_t)~0) |
| dst_reg = live_reg; |
| physreg = dst_reg; |
| } else { |
| physreg = live_reg; |
| struct ra_interval *live_interval = intervals[live_index].interval; |
| bool live_killed = live_interval->is_killed; |
| /* If this is live-through and we've processed the destinations, we |
| * need to make sure we take into account any overlapping destinations. |
| */ |
| if (!live_killed && dst_reg != (physreg_t)~0) |
| physreg = MAX2(physreg, dst_reg); |
| } |
| |
| if (!(cur_reg->flags & IR3_REG_HALF)) |
| physreg = ALIGN(physreg, 2); |
| |
| d("pushing reg %u physreg %u\n", cur_reg->name, physreg); |
| |
| unsigned interval_size = reg_size(cur_reg); |
| if (physreg + interval_size > |
| reg_file_size(file, cur_reg)) { |
| d("ran out of room for interval %u!\n", |
| cur_reg->name); |
| unreachable("reg pressure calculation was wrong!"); |
| return 0; |
| } |
| |
| if (process_dst) { |
| if (cur_reg == reg) { |
| ret_reg = physreg; |
| } else { |
| struct ra_interval *interval = &ctx->intervals[cur_reg->name]; |
| interval->physreg_start = physreg; |
| interval->physreg_end = physreg + interval_size; |
| } |
| dst_index++; |
| } else { |
| ra_push_interval(ctx, file, &intervals[live_index], physreg); |
| live_index++; |
| } |
| |
| physreg += interval_size; |
| |
| if (process_dst && !is_early_clobber(cur_reg)) { |
| dst_reg = physreg; |
| } else { |
| live_reg = physreg; |
| } |
| } |
| |
| /* If we shuffled around a tied source that is killed, we may have to update |
| * its corresponding destination since we didn't insert it above. |
| */ |
| ra_foreach_dst (dst, reg->instr) { |
| if (dst == reg) |
| break; |
| |
| struct ir3_register *tied = dst->tied; |
| if (!tied) |
| continue; |
| |
| struct ra_interval *tied_interval = &ctx->intervals[tied->def->name]; |
| if (!tied_interval->is_killed) |
| continue; |
| |
| struct ra_interval *dst_interval = &ctx->intervals[dst->name]; |
| unsigned dst_size = reg_size(dst); |
| dst_interval->physreg_start = ra_interval_get_physreg(tied_interval); |
| dst_interval->physreg_end = dst_interval->physreg_start + dst_size; |
| } |
| |
| return ret_reg; |
| } |
| |
| static void |
| update_affinity(struct ra_file *file, struct ir3_register *reg, |
| physreg_t physreg) |
| { |
| if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0) |
| return; |
| |
| if (physreg < reg->merge_set_offset) |
| return; |
| |
| if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size) |
| return; |
| |
| reg->merge_set->preferred_reg = physreg - reg->merge_set_offset; |
| } |
| |
| /* Try to find free space for a register without shuffling anything. This uses |
| * a round-robin algorithm to reduce false dependencies. |
| */ |
| static physreg_t |
| find_best_gap(struct ra_ctx *ctx, struct ra_file *file, |
| struct ir3_register *dst, unsigned file_size, unsigned size, |
| unsigned align) |
| { |
| /* This can happen if we create a very large merge set. Just bail out in that |
| * case. |
| */ |
| if (size > file_size) |
| return (physreg_t) ~0; |
| |
| BITSET_WORD *available = |
| is_early_clobber(dst) ? file->available_to_evict : file->available; |
| |
| unsigned start = ALIGN(file->start, align) % (file_size - size + align); |
| unsigned candidate = start; |
| do { |
| bool is_available = true; |
| for (unsigned i = 0; i < size; i++) { |
| if (!BITSET_TEST(available, candidate + i)) { |
| is_available = false; |
| break; |
| } |
| } |
| |
| if (is_available) { |
| is_available = |
| !check_dst_overlap(ctx, file, dst, candidate, candidate + size); |
| } |
| |
| if (is_available) { |
| file->start = (candidate + size) % file_size; |
| return candidate; |
| } |
| |
| candidate += align; |
| if (candidate + size > file_size) |
| candidate = 0; |
| } while (candidate != start); |
| |
| return (physreg_t)~0; |
| } |
| |
/* This is the main entrypoint for picking a register. Pick a free register
 * for "reg", shuffling around sources if necessary. Normally the returned
 * register may overlap killed sources (intervals with "is_killed == true").
 * However, if "reg" is early-clobber (see is_early_clobber(), which also
 * covers tied destinations with a live-through source), the helpers above
 * avoid killed sources entirely, since the destination is written before the
 * sources are consumed. For tied registers this matters because we're
 * actually allocating the destination and the tied source at the same time.
 */
| |
| static physreg_t |
| get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg) |
| { |
| unsigned file_size = reg_file_size(file, reg); |
| if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) { |
| physreg_t preferred_reg = |
| reg->merge_set->preferred_reg + reg->merge_set_offset; |
| if (preferred_reg + reg_size(reg) <= file_size && |
| preferred_reg % reg_elem_size(reg) == 0 && |
| get_reg_specified(ctx, file, reg, preferred_reg, false)) |
| return preferred_reg; |
| } |
| |
| /* If this register is a subset of a merge set which we have not picked a |
| * register for, first try to allocate enough space for the entire merge |
| * set. |
| */ |
| unsigned size = reg_size(reg); |
| if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 && |
| size < reg->merge_set->size) { |
| physreg_t best_reg = find_best_gap(ctx, file, reg, file_size, |
| reg->merge_set->size, |
| reg->merge_set->alignment); |
| if (best_reg != (physreg_t)~0u) { |
| best_reg += reg->merge_set_offset; |
| return best_reg; |
| } |
| } |
| |
   /* For ALU and SFU instructions, if the src reg is available to pick, use
    * it, since this doesn't introduce unnecessary dependencies and
    * potentially avoids needing (ss) syncs for write-after-read hazards on
    * SFU instructions:
    */
| if (is_sfu(reg->instr) || is_alu(reg->instr)) { |
| for (unsigned i = 0; i < reg->instr->srcs_count; i++) { |
| struct ir3_register *src = reg->instr->srcs[i]; |
| if (!ra_reg_is_src(src)) |
| continue; |
| if (ra_get_file(ctx, src) == file && reg_size(src) >= size) { |
| struct ra_interval *src_interval = &ctx->intervals[src->def->name]; |
| physreg_t src_physreg = ra_interval_get_physreg(src_interval); |
| if (src_physreg % reg_elem_size(reg) == 0 && |
| src_physreg + size <= file_size && |
| get_reg_specified(ctx, file, reg, src_physreg, false)) |
| return src_physreg; |
| } |
| } |
| } |
| |
| physreg_t best_reg = |
| find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg)); |
| if (best_reg != (physreg_t)~0u) { |
| return best_reg; |
| } |
| |
| /* Ok, we couldn't find anything that fits. Here is where we have to start |
| * moving things around to make stuff fit. First try solely evicting |
| * registers in the way. |
| */ |
| unsigned best_eviction_count = ~0; |
| for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) { |
| unsigned eviction_count; |
| if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) { |
| if (eviction_count < best_eviction_count) { |
| best_eviction_count = eviction_count; |
| best_reg = i; |
| } |
| } |
| } |
| |
| if (best_eviction_count != ~0) { |
| ASSERTED bool result = try_evict_regs( |
| ctx, file, reg, best_reg, &best_eviction_count, false, false); |
| assert(result); |
| return best_reg; |
| } |
| |
| /* Use the dumb fallback only if try_evict_regs() fails. */ |
| return compress_regs_left(ctx, file, reg); |
| } |
| |
| static void |
| assign_reg(struct ir3_instruction *instr, struct ir3_register *reg, |
| unsigned num) |
| { |
| if (reg->flags & IR3_REG_ARRAY) { |
| reg->array.base = num; |
| if (reg->flags & IR3_REG_RELATIV) |
| reg->array.offset += num; |
| else |
| reg->num = num + reg->array.offset; |
| } else { |
| reg->num = num; |
| } |
| } |
| |
| static void |
| mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src) |
| { |
| struct ra_interval *interval = &ctx->intervals[src->def->name]; |
| |
| if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed || |
| interval->interval.parent || |
| !rb_tree_is_empty(&interval->interval.children)) |
| return; |
| |
| ra_file_mark_killed(ra_get_file(ctx, src), interval); |
| } |
| |
| static void |
| insert_dst(struct ra_ctx *ctx, struct ir3_register *dst) |
| { |
| struct ra_file *file = ra_get_file(ctx, dst); |
| struct ra_interval *interval = &ctx->intervals[dst->name]; |
| |
| d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval)); |
| |
| if (!(dst->flags & IR3_REG_UNUSED)) |
| ra_file_insert(file, interval); |
| |
| assign_reg(dst->instr, dst, ra_interval_get_num(interval)); |
| } |
| |
| static void |
| allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst, |
| physreg_t physreg) |
| { |
| struct ra_file *file = ra_get_file(ctx, dst); |
| struct ra_interval *interval = &ctx->intervals[dst->name]; |
| update_affinity(file, dst, physreg); |
| |
| ra_interval_init(interval, dst); |
| interval->physreg_start = physreg; |
| interval->physreg_end = physreg + reg_size(dst); |
| } |
| |
| /* If a tied destination interferes with its source register, we have to insert |
| * a copy beforehand to copy the source to the destination. Because we are using |
| * the parallel_copies array and not creating a separate copy, this copy will |
| * happen in parallel with any shuffling around of the tied source, so we have |
| * to copy the source *as it exists before it is shuffled around*. We do this by |
| * inserting the copy early, before any other copies are inserted. We don't |
| * actually know the destination of the copy, but that's ok because the |
| * dst_interval will be filled out later. |
| */ |
| static void |
| insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst) |
| { |
| struct ir3_register *tied = dst->tied; |
| |
| if (!tied) |
| return; |
| |
| struct ra_interval *tied_interval = &ctx->intervals[tied->def->name]; |
| struct ra_interval *dst_interval = &ctx->intervals[dst->name]; |
| |
| if (tied_interval->is_killed) |
| return; |
| |
| physreg_t tied_physreg = ra_interval_get_physreg(tied_interval); |
| |
| array_insert(ctx, ctx->parallel_copies, |
| (struct ra_parallel_copy){ |
| .interval = dst_interval, |
| .src = tied_physreg, |
| }); |
| } |
| |
| static void |
| allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst) |
| { |
| struct ra_file *file = ra_get_file(ctx, dst); |
| |
| struct ir3_register *tied = dst->tied; |
| if (tied) { |
| struct ra_interval *tied_interval = &ctx->intervals[tied->def->name]; |
| if (tied_interval->is_killed) { |
| /* The easy case: the source is killed, so we can just reuse it |
| * for the destination. |
| */ |
| allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval)); |
| return; |
| } |
| } |
| |
| /* All the hard work is done by get_reg here. */ |
| physreg_t physreg = get_reg(ctx, file, dst); |
| |
| allocate_dst_fixed(ctx, dst, physreg); |
| } |
| |
| static void |
| assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr, |
| struct ir3_register *src) |
| { |
| struct ra_interval *interval = &ctx->intervals[src->def->name]; |
| struct ra_file *file = ra_get_file(ctx, src); |
| |
| struct ir3_register *tied = src->tied; |
| physreg_t physreg; |
| if (tied) { |
| struct ra_interval *tied_interval = &ctx->intervals[tied->name]; |
| physreg = ra_interval_get_physreg(tied_interval); |
| } else { |
| physreg = ra_interval_get_physreg(interval); |
| } |
| |
| assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags)); |
| |
| if (src->flags & IR3_REG_FIRST_KILL) |
| ra_file_remove(file, interval); |
| } |
| |
/* Insert a parallel copy instruction right before "instr", built from the
 * parallel copy entries we've accumulated.
 */
| static void |
| insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) |
| { |
| if (ctx->parallel_copies_count == 0) |
| return; |
| |
| struct ir3_instruction *pcopy = |
| ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY, |
| ctx->parallel_copies_count, ctx->parallel_copies_count); |
| |
| for (unsigned i = 0; i < ctx->parallel_copies_count; i++) { |
| struct ra_parallel_copy *entry = &ctx->parallel_copies[i]; |
| struct ir3_register *reg = |
| ir3_dst_create(pcopy, INVALID_REG, |
| entry->interval->interval.reg->flags & |
| (IR3_REG_HALF | IR3_REG_ARRAY)); |
| reg->size = entry->interval->interval.reg->size; |
| reg->wrmask = entry->interval->interval.reg->wrmask; |
| assign_reg(pcopy, reg, ra_interval_get_num(entry->interval)); |
| } |
| |
| for (unsigned i = 0; i < ctx->parallel_copies_count; i++) { |
| struct ra_parallel_copy *entry = &ctx->parallel_copies[i]; |
| struct ir3_register *reg = |
| ir3_src_create(pcopy, INVALID_REG, |
| entry->interval->interval.reg->flags & |
| (IR3_REG_HALF | IR3_REG_ARRAY)); |
| reg->size = entry->interval->interval.reg->size; |
| reg->wrmask = entry->interval->interval.reg->wrmask; |
| assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags)); |
| } |
| |
| list_del(&pcopy->node); |
| list_addtail(&pcopy->node, &instr->node); |
| ctx->parallel_copies_count = 0; |
| } |
| |
| static void |
| handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) |
| { |
| /* First, mark sources as going-to-be-killed while allocating the dest. */ |
| ra_foreach_src (src, instr) { |
| mark_src_killed(ctx, src); |
| } |
| |
| /* Pre-insert tied dst copies. */ |
| ra_foreach_dst (dst, instr) { |
| insert_tied_dst_copy(ctx, dst); |
| } |
| |
| /* Allocate the destination. */ |
| ra_foreach_dst (dst, instr) { |
| allocate_dst(ctx, dst); |
| } |
| |
   /* Now handle sources. Go backwards so that when multiple sources share a
    * killed def, we only remove it once, when processing the source marked
    * IR3_REG_FIRST_KILL (which comes last in the reverse walk).
    */
| ra_foreach_src_rev (src, instr) { |
| assign_src(ctx, instr, src); |
| } |
| |
| /* Now finally insert the destination into the map. */ |
| ra_foreach_dst (dst, instr) { |
| insert_dst(ctx, dst); |
| } |
| |
| insert_parallel_copy_instr(ctx, instr); |
| } |
| |
| static void |
| handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr) |
| { |
| struct ir3_register *dst = instr->dsts[0]; |
| struct ir3_register *src = instr->srcs[0]; |
| |
| if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) { |
| handle_normal_instr(ctx, instr); |
| return; |
| } |
| |
| struct ra_interval *src_interval = &ctx->intervals[src->def->name]; |
| |
| physreg_t physreg = ra_interval_get_physreg(src_interval); |
| assign_src(ctx, instr, src); |
| |
| allocate_dst_fixed( |
| ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset); |
| insert_dst(ctx, dst); |
| } |
| |
| static void |
| handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr) |
| { |
| struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set; |
| unsigned dst_offset = instr->dsts[0]->merge_set_offset; |
| |
| if (!dst_set || dst_set->regs_count == 1) { |
| handle_normal_instr(ctx, instr); |
| return; |
| } |
| |
| /* We need to check if any of the sources are contained in an interval |
| * that is at least as large as the vector. In this case, we should put |
| * the vector inside that larger interval. (There should be one |
| * unambiguous place to put it, because values sharing the same merge set |
| * should be allocated together.) This can happen in a case like: |
| * |
| * ssa_1 (wrmask=0xf) = ... |
| * ssa_2 = split ssa_1 off:0 |
| * ssa_3 = split ssa_1 off:1 |
| * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3 |
| * ... = (kill)ssa_1 |
| * ... = (kill)ssa_4 |
| * |
| * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it. |
| */ |
| physreg_t dst_fixed = (physreg_t)~0u; |
| |
| ra_foreach_src (src, instr) { |
| if (src->flags & IR3_REG_FIRST_KILL) { |
| mark_src_killed(ctx, src); |
| } |
| |
| struct ra_interval *interval = &ctx->intervals[src->def->name]; |
| |
| if (src->def->merge_set != dst_set || interval->is_killed) |
| continue; |
| while (interval->interval.parent != NULL) { |
| interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); |
| } |
| if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) { |
| dst_fixed = interval->physreg_start - |
| interval->interval.reg->merge_set_offset + dst_offset; |
| } else { |
| /* For sources whose root interval is smaller than the |
| * destination (i.e. the normal case), we will shuffle them |
| * around after allocating the destination. Mark them killed so |
| * that the destination can be allocated over them, even if they |
| * aren't actually killed. |
| */ |
| ra_file_mark_killed(ra_get_file(ctx, src), interval); |
| } |
| } |
| |
| if (dst_fixed != (physreg_t)~0u) |
| allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed); |
| else |
| allocate_dst(ctx, instr->dsts[0]); |
| |
| /* Remove the temporary is_killed we added */ |
| ra_foreach_src (src, instr) { |
| struct ra_interval *interval = &ctx->intervals[src->def->name]; |
| while (interval->interval.parent != NULL) { |
| interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); |
| } |
| |
| /* Filter out cases where it actually should be killed */ |
| if (interval != &ctx->intervals[src->def->name] || |
| !(src->flags & IR3_REG_KILL)) { |
| ra_file_unmark_killed(ra_get_file(ctx, src), interval); |
| } |
| } |
| |
| ra_foreach_src_rev (src, instr) { |
| assign_src(ctx, instr, src); |
| } |
| |
| /* We need to do this before insert_dst(), so that children of the |
| * destination which got marked as killed and then shuffled around to make |
| * space for the destination have the correct pcopy destination that |
| * matches what we assign the source of the collect to in assign_src(). |
| * |
| * TODO: In this case we'll wind up copying the value in the pcopy and |
| * then again in the collect. We could avoid one of those by updating the |
| * pcopy destination to match up with the final location of the source |
| * after the collect and making the collect a no-op. However this doesn't |
| * seem to happen often. |
| */ |
| insert_parallel_copy_instr(ctx, instr); |
| |
| /* Note: insert_dst will automatically shuffle around any intervals that |
| * fall inside the collect's destination interval by making them children |
| * of the destination's interval. |
| */ |
| |
| insert_dst(ctx, instr->dsts[0]); |
| } |
| |
| /* Parallel copies before RA should only be at the end of the block, for |
| * phis. For these we only need to fill in the sources, and then we fill in |
| * the destinations in the successor block. |
| */ |
| static void |
| handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr) |
| { |
| ra_foreach_src_rev (src, instr) { |
| assign_src(ctx, instr, src); |
| } |
| } |
| |
| /* Some inputs may need to be precolored. We need to handle those first, so |
| * that other non-precolored inputs don't accidentally get allocated over |
| * them. Inputs are the very first thing in the shader, so it shouldn't be a |
| * problem to allocate them to a specific physreg. |
| */ |
| |
| static void |
| handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr) |
| { |
| if (instr->dsts[0]->num == INVALID_REG) |
| return; |
| |
| struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); |
| struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; |
| physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]); |
| allocate_dst_fixed(ctx, instr->dsts[0], physreg); |
| |
| d("insert precolored dst %u physreg %u", instr->dsts[0]->name, |
| ra_interval_get_physreg(interval)); |
| |
| ra_file_insert(file, interval); |
| interval->frozen = true; |
| } |
| |
| static void |
| handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr) |
| { |
| if (instr->dsts[0]->num != INVALID_REG) |
| return; |
| |
| allocate_dst(ctx, instr->dsts[0]); |
| |
| struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); |
| struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; |
| ra_file_insert(file, interval); |
| } |
| |
| static void |
| assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr) |
| { |
| struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name]; |
| struct ra_file *file = ra_get_file(ctx, instr->dsts[0]); |
| |
| if (instr->dsts[0]->num == INVALID_REG) { |
| assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval)); |
| } else { |
| interval->frozen = false; |
| } |
| |
| if (instr->dsts[0]->flags & IR3_REG_UNUSED) |
| ra_file_remove(file, interval); |
| |
| ra_foreach_src_rev (src, instr) |
| assign_src(ctx, instr, src); |
| } |
| |
| /* chmask is a bit weird, because it has pre-colored sources due to the need |
| * to pass some registers to the next stage. Fortunately there are at most |
| * two, and there should be no other live values by the time we get to |
| * this instruction, so we only have to do the minimum and don't need any |
| * fancy fallbacks. |
| * |
| * TODO: Add more complete handling of precolored sources, e.g. for function |
| * argument handling. We'd need a way to mark sources as fixed so that they |
| * don't get moved around when placing other sources in the fallback case, and |
| * a duplication of much of the logic in get_reg(). This also opens another |
| * can of worms, e.g. what if the precolored source is a split of a vector |
| * which is still live -- this breaks our assumption that splits don't incur |
| * any "extra" register requirements and we'd have to break it out of the |
| * parent ra_interval. |
| */ |
| |
| static void |
| handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src) |
| { |
| struct ra_file *file = ra_get_file(ctx, src); |
| struct ra_interval *interval = &ctx->intervals[src->def->name]; |
| physreg_t physreg = ra_reg_get_physreg(src); |
| |
| if (ra_interval_get_num(interval) == src->num) |
| return; |
| |
| /* Try evicting stuff in our way if it isn't free. This won't move |
| * anything unless it overlaps with our precolored physreg, so we don't |
| * have to worry about evicting other precolored sources. |
| */ |
| if (!get_reg_specified(ctx, file, src, physreg, true)) { |
| unsigned eviction_count; |
| if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true, |
| false)) { |
| unreachable("failed to evict for precolored source!"); |
| return; |
| } |
| } |
| |
| ra_move_interval(ctx, file, interval, physreg); |
| } |
| |
| static void |
| handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr) |
| { |
| /* Note: we purposely don't mark sources as killed, so that we can reuse |
| * some of the get_reg() machinery as if the source were a destination. |
| * Marking them as killed would make e.g. get_reg_specified() not work |
| * correctly. |
| */ |
| ra_foreach_src (src, instr) { |
| assert(src->num != INVALID_REG); |
| handle_precolored_source(ctx, src); |
| } |
| |
| ra_foreach_src (src, instr) { |
| struct ra_file *file = ra_get_file(ctx, src); |
| struct ra_interval *interval = &ctx->intervals[src->def->name]; |
| if (src->flags & IR3_REG_FIRST_KILL) |
| ra_file_remove(file, interval); |
| } |
| |
| insert_parallel_copy_instr(ctx, instr); |
| } |
| |
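| /* Each block records, in its ra_block_state, a map from defs to the |
| * physreg they actually occupy at the end of the block whenever that |
| * differs from the def's original assignment ("renames"). A hedged example |
| * with invented numbers: if a def was assigned physreg 4 but a live-range |
| * split in some predecessor moved it to physreg 12, that predecessor's |
| * renames map holds (def -> 12) and read_register() returns 12 instead of |
| * the stale 4. |
| */ |
| |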
| static physreg_t |
| read_register(struct ra_ctx *ctx, struct ir3_block *block, |
| struct ir3_register *def) |
| { |
| struct ra_block_state *state = &ctx->blocks[block->index]; |
| if (state->renames) { |
| struct hash_entry *entry = _mesa_hash_table_search(state->renames, def); |
| if (entry) { |
| return (physreg_t)(uintptr_t)entry->data; |
| } |
| } |
| |
| return ra_reg_get_physreg(def); |
| } |
| |
| static void |
| handle_live_in(struct ra_ctx *ctx, struct ir3_register *def) |
| { |
| physreg_t physreg = ~0; |
| for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { |
| struct ir3_block *pred = ctx->block->predecessors[i]; |
| struct ra_block_state *pred_state = &ctx->blocks[pred->index]; |
| |
| if (!pred_state->visited) |
| continue; |
| |
| physreg = read_register(ctx, pred, def); |
| break; |
| } |
| |
| assert(physreg != (physreg_t)~0); |
| |
| struct ra_interval *interval = &ctx->intervals[def->name]; |
| struct ra_file *file = ra_get_file(ctx, def); |
| ra_interval_init(interval, def); |
| interval->physreg_start = physreg; |
| interval->physreg_end = physreg + reg_size(def); |
| ra_file_insert(file, interval); |
| } |
| |
| static void |
| handle_live_out(struct ra_ctx *ctx, struct ir3_register *def) |
| { |
| /* Skip parallel copies which in the original program are only used as phi |
| * arguments. Even though phi arguments are live out, they are only |
| * assigned when the phi is. |
| */ |
| if (def->instr->opc == OPC_META_PARALLEL_COPY) |
| return; |
| |
| struct ra_block_state *state = &ctx->blocks[ctx->block->index]; |
| struct ra_interval *interval = &ctx->intervals[def->name]; |
| physreg_t physreg = ra_interval_get_physreg(interval); |
| if (physreg != ra_reg_get_physreg(def)) { |
| if (!state->renames) |
| state->renames = _mesa_pointer_hash_table_create(ctx); |
| _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg); |
| } |
| } |
| |
| static void |
| handle_phi(struct ra_ctx *ctx, struct ir3_register *def) |
| { |
| struct ra_file *file = ra_get_file(ctx, def); |
| struct ra_interval *interval = &ctx->intervals[def->name]; |
| |
| /* phis are always scalar, so they should already be the smallest possible |
| * size. However they may be coalesced with other live-in values/phi |
| * nodes, so check for that here. |
| */ |
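| /* A hedged example with invented numbers: if this phi was coalesced with a |
| * live-in vector whose interval starts 2 half-regs before the phi's, and |
| * that vector sits at physreg 6, the phi must be placed at 6 + 2 = 8 rather |
| * than wherever get_reg() would have put it. |
| */ |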
| struct ir3_reg_interval *parent_ir3 = |
| ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start); |
| physreg_t physreg; |
| if (parent_ir3) { |
| struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3); |
| physreg = ra_interval_get_physreg(parent) + |
| (def->interval_start - parent_ir3->reg->interval_start); |
| } else { |
| physreg = get_reg(ctx, file, def); |
| } |
| |
| allocate_dst_fixed(ctx, def, physreg); |
| |
| ra_file_insert(file, interval); |
| } |
| |
| static void |
| assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi) |
| { |
| struct ra_file *file = ra_get_file(ctx, phi->dsts[0]); |
| struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name]; |
| assert(!interval->interval.parent); |
| unsigned num = ra_interval_get_num(interval); |
| assign_reg(phi, phi->dsts[0], num); |
| |
| /* Assign the parallelcopy sources of this phi */ |
| for (unsigned i = 0; i < phi->srcs_count; i++) { |
| if (phi->srcs[i]->def) { |
| assign_reg(phi, phi->srcs[i], num); |
| assign_reg(phi, phi->srcs[i]->def, num); |
| } |
| } |
| |
| if (phi->dsts[0]->flags & IR3_REG_UNUSED) |
| ra_file_remove(file, interval); |
| } |
| |
| /* When we split a live range, we sometimes need to emit fixup code at the end |
| * of a block. For example, something like: |
| * |
| * a = ... |
| * if (...) { |
| * ... |
| * a' = a |
| * b = ... // a evicted to make room for b |
| * ... |
| * } |
| * ... = a |
| * |
| * When we insert the copy to a' in insert_parallel_copy_instr(), this |
| * forces us to insert another copy "a = a'" at the end of the if. |
| * Normally this would |
| * also entail adding a phi node, but since we're about to go out of SSA |
| * anyway we just insert an extra move. Note, however, that "b" might be used |
| * in a phi node at the end of the if and share registers with "a", so we |
| * have to be careful to extend any preexisting parallelcopy instruction |
| * instead of creating our own in order to guarantee that they properly get |
| * swapped. |
| */ |
| |
| static void |
| insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src, |
| struct ir3_register *reg) |
| { |
| struct ir3_instruction *old_pcopy = NULL; |
| if (!list_is_empty(&block->instr_list)) { |
| struct ir3_instruction *last = |
| list_entry(block->instr_list.prev, struct ir3_instruction, node); |
| if (last->opc == OPC_META_PARALLEL_COPY) |
| old_pcopy = last; |
| } |
| |
| unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0; |
| struct ir3_instruction *pcopy = ir3_instr_create( |
| block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1); |
| |
| for (unsigned i = 0; i < old_pcopy_srcs; i++) { |
| old_pcopy->dsts[i]->instr = pcopy; |
| pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i]; |
| } |
| |
| unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY); |
| |
| struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags); |
| dst_reg->wrmask = reg->wrmask; |
| dst_reg->size = reg->size; |
| assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags)); |
| |
| for (unsigned i = 0; i < old_pcopy_srcs; i++) { |
| pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i]; |
| } |
| |
| struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags); |
| src_reg->wrmask = reg->wrmask; |
| src_reg->size = reg->size; |
| assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags)); |
| |
| if (old_pcopy) |
| list_del(&old_pcopy->node); |
| } |
| |
| static void |
| insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval) |
| { |
| physreg_t physreg = ra_interval_get_physreg(interval); |
| |
| bool shared = interval->interval.reg->flags & IR3_REG_SHARED; |
| struct ir3_block **predecessors = |
| shared ? ctx->block->physical_predecessors : ctx->block->predecessors; |
| unsigned predecessors_count = shared |
| ? ctx->block->physical_predecessors_count |
| : ctx->block->predecessors_count; |
| |
| for (unsigned i = 0; i < predecessors_count; i++) { |
| struct ir3_block *pred = predecessors[i]; |
| struct ra_block_state *pred_state = &ctx->blocks[pred->index]; |
| |
| if (!pred_state->visited) |
| continue; |
| |
| physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg); |
| if (pred_reg != physreg) { |
| insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg); |
| |
| /* This is a bit tricky, but when visiting the destination of a |
| * physical-only edge, we have two predecessors (the if and the |
| * header block) and both have multiple successors. We pick the |
| * register for all live-ins from the normal edge, which should |
| * guarantee that there's no need for shuffling things around in |
| * the normal predecessor as long as there are no phi nodes, but |
| * we still may need to insert fixup code in the physical |
| * predecessor (i.e. the last block of the if) and that has |
| * another successor (the block after the if) so we need to update |
| * the renames state for when we process the other successor. This |
| * crucially depends on the other successor getting processed |
| * after this. |
| * |
| * For normal (non-physical) edges we disallow critical edges so |
| * that hacks like this aren't necessary. |
| */ |
| if (!pred_state->renames) |
| pred_state->renames = _mesa_pointer_hash_table_create(ctx); |
| _mesa_hash_table_insert(pred_state->renames, interval->interval.reg, |
| (void *)(uintptr_t)physreg); |
| } |
| } |
| } |
| |
| static void |
| insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file) |
| { |
| BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index]; |
| rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, |
| physreg_node) { |
| /* Skip phi nodes. This needs to happen after phi nodes are allocated, |
| * because we may have to move live-ins around to make space for phi |
| * nodes, but we shouldn't be handling phi nodes here. |
| */ |
| if (BITSET_TEST(live_in, interval->interval.reg->name)) |
| insert_live_in_move(ctx, interval); |
| } |
| } |
| |
| static void |
| insert_entry_regs(struct ra_block_state *state, struct ra_file *file) |
| { |
| rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, |
| physreg_node) { |
| _mesa_hash_table_insert(state->entry_regs, interval->interval.reg, |
| (void *)(uintptr_t)interval->physreg_start); |
| } |
| } |
| |
| static void |
| insert_live_in_moves(struct ra_ctx *ctx) |
| { |
| insert_file_live_in_moves(ctx, &ctx->full); |
| insert_file_live_in_moves(ctx, &ctx->half); |
| insert_file_live_in_moves(ctx, &ctx->shared); |
| |
| /* If not all predecessors have been visited yet, record the registers |
| * live at the start of this block, so that insert_live_out_moves() in the |
| * as-yet-unvisited predecessors can consult them. |
| */ |
| bool all_preds_visited = true; |
| for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { |
| if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) { |
| all_preds_visited = false; |
| break; |
| } |
| } |
| |
| if (!all_preds_visited) { |
| struct ra_block_state *state = &ctx->blocks[ctx->block->index]; |
| state->entry_regs = _mesa_pointer_hash_table_create(ctx); |
| |
| insert_entry_regs(state, &ctx->full); |
| insert_entry_regs(state, &ctx->half); |
| insert_entry_regs(state, &ctx->shared); |
| } |
| } |
| |
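| /* A hedged scenario tying entry_regs and live-out moves together: suppose |
| * block B has a back-edge predecessor P that is processed after B. B |
| * records its registers at entry above (insert_entry_regs); when P is |
| * later processed and one of B's live-ins ended up in a different physreg |
| * at the end of P, insert_live_out_move() below emits the fixup copy in P |
| * using B's recorded entry_regs. |
| */ |
| |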
| static void |
| insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval) |
| { |
| for (unsigned i = 0; i < 2; i++) { |
| if (!ctx->block->successors[i]) |
| continue; |
| |
| struct ir3_block *succ = ctx->block->successors[i]; |
| struct ra_block_state *succ_state = &ctx->blocks[succ->index]; |
| |
| if (!succ_state->visited) |
| continue; |
| |
| struct hash_entry *entry = _mesa_hash_table_search( |
| succ_state->entry_regs, interval->interval.reg); |
| if (!entry) |
| continue; |
| |
| physreg_t new_reg = (physreg_t)(uintptr_t)entry->data; |
| if (new_reg != interval->physreg_start) { |
| insert_liveout_copy(ctx->block, new_reg, interval->physreg_start, |
| interval->interval.reg); |
| } |
| } |
| } |
| |
| static void |
| insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file) |
| { |
| rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, |
| physreg_node) { |
| insert_live_out_move(ctx, interval); |
| } |
| } |
| |
| static void |
| insert_live_out_moves(struct ra_ctx *ctx) |
| { |
| insert_file_live_out_moves(ctx, &ctx->full); |
| insert_file_live_out_moves(ctx, &ctx->half); |
| insert_file_live_out_moves(ctx, &ctx->shared); |
| } |
| |
| static void |
| handle_block(struct ra_ctx *ctx, struct ir3_block *block) |
| { |
| ctx->block = block; |
| |
| /* Reset the register files from the last block */ |
| ra_file_init(&ctx->full); |
| ra_file_init(&ctx->half); |
| ra_file_init(&ctx->shared); |
| |
| /* Handle live-ins, phis, and input meta-instructions. These all appear |
| * live at the beginning of the block and interfere with each other, so |
| * they need to be allocated "in parallel". This means that we |
| * have to allocate all of them, inserting them into the file, and then |
| * delay updating the IR until all of them are allocated. |
| * |
| * Handle precolored inputs first, because we need to make sure that other |
| * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes |
| * and inputs at the same time, because the first block doesn't have |
| * predecessors. Therefore handle_live_in doesn't have to worry about |
| * them. |
| */ |
| |
| foreach_instr (instr, &block->instr_list) { |
| if (instr->opc == OPC_META_INPUT) |
| handle_precolored_input(ctx, instr); |
| else |
| break; |
| } |
| |
| unsigned name; |
| BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], |
| ctx->live->definitions_count) { |
| struct ir3_register *reg = ctx->live->definitions[name]; |
| handle_live_in(ctx, reg); |
| } |
| |
| foreach_instr (instr, &block->instr_list) { |
| if (instr->opc == OPC_META_PHI) |
| handle_phi(ctx, instr->dsts[0]); |
| else if (instr->opc == OPC_META_INPUT || |
| instr->opc == OPC_META_TEX_PREFETCH) |
| handle_input(ctx, instr); |
| else |
| break; |
| } |
| |
| /* After this point, every live-in/phi/input has an interval assigned to |
| * it. We delay actually assigning values until everything has been |
| * allocated, so we can simply ignore any parallel copy entries created |
| * when shuffling them around. |
| */ |
| ctx->parallel_copies_count = 0; |
| |
| insert_live_in_moves(ctx); |
| |
| if (RA_DEBUG) { |
| d("after live-in block %u:\n", block->index); |
| ra_ctx_dump(ctx); |
| } |
| |
| /* Now we're done with processing live-ins, and can handle the body of the |
| * block. |
| */ |
| foreach_instr (instr, &block->instr_list) { |
| di(instr, "processing"); |
| |
| if (instr->opc == OPC_META_PHI) |
| assign_phi(ctx, instr); |
| else if (instr->opc == OPC_META_INPUT || |
| instr->opc == OPC_META_TEX_PREFETCH) |
| assign_input(ctx, instr); |
| else if (instr->opc == OPC_META_SPLIT) |
| handle_split(ctx, instr); |
| else if (instr->opc == OPC_META_COLLECT) |
| handle_collect(ctx, instr); |
| else if (instr->opc == OPC_META_PARALLEL_COPY) |
| handle_pcopy(ctx, instr); |
| else if (instr->opc == OPC_CHMASK) |
| handle_chmask(ctx, instr); |
| else |
| handle_normal_instr(ctx, instr); |
| |
| if (RA_DEBUG) |
| ra_ctx_dump(ctx); |
| } |
| |
| insert_live_out_moves(ctx); |
| |
| BITSET_FOREACH_SET (name, ctx->live->live_out[block->index], |
| ctx->live->definitions_count) { |
| struct ir3_register *reg = ctx->live->definitions[name]; |
| handle_live_out(ctx, reg); |
| } |
| |
| ctx->blocks[block->index].visited = true; |
| } |
| |
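| /* Convert the measured pressure into the register-file budget we hand to |
| * the allocator: round up to whole vec4s, then keep growing the budget as |
| * long as doing so costs no waves. A hedged worked example with invented |
| * numbers: pressure = 60 half-regs gives reg_count = DIV_ROUND_UP(60, 8) = |
| * 8 vec4s; if the achievable wave count only drops past 12 vec4s, the loop |
| * advances target to 13, the check fails there, and we return |
| * (13 - 1) * 2 * 4 = 96 half-regs of budget. |
| */ |
| |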
| static unsigned |
| calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure) |
| { |
| /* Registers are allocated in units of vec4, so switch from units of |
| * half-regs to vec4. |
| */ |
| unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4); |
| |
| bool double_threadsize = ir3_should_double_threadsize(v, reg_count); |
| |
| unsigned target = reg_count; |
| unsigned reg_independent_max_waves = |
| ir3_get_reg_independent_max_waves(v, double_threadsize); |
| unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves( |
| v->compiler, reg_count, double_threadsize); |
| unsigned target_waves = |
| MIN2(reg_independent_max_waves, reg_dependent_max_waves); |
| |
| while (target <= RA_FULL_SIZE / (2 * 4) && |
| ir3_should_double_threadsize(v, target) == double_threadsize && |
| ir3_get_reg_dependent_max_waves(v->compiler, target, |
| double_threadsize) >= target_waves) |
| target++; |
| |
| return (target - 1) * 2 * 4; |
| } |
| |
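| /* Pressure accounting is in half-reg units. A hedged example: a full vec4 |
| * destination (wrmask 0xf) has reg_size() == 8 and adds 8 to the full |
| * pressure; a half scalar adds 1 to the half pressure and, when the file is |
| * merged, 1 to the full pressure as well, since halves and fulls then share |
| * one file. |
| */ |
| |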
| static void |
| add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg, |
| bool merged_regs) |
| { |
| unsigned size = reg_size(reg); |
| if (reg->flags & IR3_REG_HALF) |
| pressure->half += size; |
| if (!(reg->flags & IR3_REG_HALF) || merged_regs) |
| pressure->full += size; |
| } |
| |
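| /* calc_min_limit_pressure() below only needs the interval-tree bookkeeping |
| * from ir3_reg_ctx, not any physical-register side effects, so its |
| * callbacks are deliberately no-ops. |
| */ |
| |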
| static void |
| dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) |
| { |
| } |
| |
| static void |
| dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) |
| { |
| } |
| |
| static void |
| dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent, |
| struct ir3_reg_interval *child) |
| { |
| } |
| |
| /* Calculate the minimum possible limit on register pressure so that spilling |
| * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall. |
| */ |
| |
| static void |
| calc_min_limit_pressure(struct ir3_shader_variant *v, |
| struct ir3_liveness *live, |
| struct ir3_pressure *limit) |
| { |
| struct ir3_block *start = ir3_start_block(v->ir); |
| struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx); |
| struct ir3_reg_interval *intervals = |
| rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count); |
| |
| ctx->interval_add = dummy_interval_add; |
| ctx->interval_delete = dummy_interval_delete; |
| ctx->interval_readd = dummy_interval_readd; |
| |
| limit->full = limit->half = 0; |
| |
| struct ir3_pressure cur_pressure = {0}; |
| foreach_instr (input, &start->instr_list) { |
| if (input->opc != OPC_META_INPUT && |
| input->opc != OPC_META_TEX_PREFETCH) |
| break; |
| |
| add_pressure(&cur_pressure, input->dsts[0], v->mergedregs); |
| } |
| |
| limit->full = MAX2(limit->full, cur_pressure.full); |
| limit->half = MAX2(limit->half, cur_pressure.half); |
| |
| foreach_instr (input, &start->instr_list) { |
| if (input->opc != OPC_META_INPUT && |
| input->opc != OPC_META_TEX_PREFETCH) |
| break; |
| |
| /* Pre-colored inputs may have holes between them, which increase the |
| * pressure. */ |
| struct ir3_register *dst = input->dsts[0]; |
| if (dst->num != INVALID_REG) { |
| unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst); |
| if (dst->flags & IR3_REG_HALF) |
| limit->half = MAX2(limit->half, physreg); |
| if (!(dst->flags & IR3_REG_HALF) || v->mergedregs) |
| limit->full = MAX2(limit->full, physreg); |
| } |
| } |
| |
| foreach_block (block, &v->ir->block_list) { |
| rb_tree_init(&ctx->intervals); |
| |
| unsigned name; |
| BITSET_FOREACH_SET (name, live->live_in[block->index], |
| live->definitions_count) { |
| struct ir3_register *reg = live->definitions[name]; |
| ir3_reg_interval_init(&intervals[reg->name], reg); |
| ir3_reg_interval_insert(ctx, &intervals[reg->name]); |
| } |
| |
| foreach_instr (instr, &block->instr_list) { |
| ra_foreach_dst (dst, instr) { |
| ir3_reg_interval_init(&intervals[dst->name], dst); |
| } |
| /* phis and parallel copies can be deleted via spilling */ |
| |
| if (instr->opc == OPC_META_PHI) { |
| ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]); |
| continue; |
| } |
| |
| if (instr->opc == OPC_META_PARALLEL_COPY) |
| continue; |
| |
| cur_pressure = (struct ir3_pressure) {0}; |
| |
| ra_foreach_dst (dst, instr) { |
| if (dst->tied && !(dst->tied->flags & IR3_REG_KILL)) |
| add_pressure(&cur_pressure, dst, v->mergedregs); |
| } |
| |
| ra_foreach_src_rev (src, instr) { |
| /* We currently don't support spilling the parent of a source when |
| * making space for sources, so we have to keep track of the |
| * intervals and figure out the root of the tree to figure out how |
| * much space we need. |
| * |
| * TODO: We should probably support this in the spiller. |
| */ |
| struct ir3_reg_interval *interval = &intervals[src->def->name]; |
| while (interval->parent) |
| interval = interval->parent; |
| add_pressure(&cur_pressure, interval->reg, v->mergedregs); |
| |
| if (src->flags & IR3_REG_FIRST_KILL) |
| ir3_reg_interval_remove(ctx, &intervals[src->def->name]); |
| } |
| |
| limit->full = MAX2(limit->full, cur_pressure.full); |
| limit->half = MAX2(limit->half, cur_pressure.half); |
| |
| cur_pressure = (struct ir3_pressure) {0}; |
| |
| ra_foreach_dst (dst, instr) { |
| ir3_reg_interval_init(&intervals[dst->name], dst); |
| ir3_reg_interval_insert(ctx, &intervals[dst->name]); |
| add_pressure(&cur_pressure, dst, v->mergedregs); |
| } |
| |
| limit->full = MAX2(limit->full, cur_pressure.full); |
| limit->half = MAX2(limit->half, cur_pressure.half); |
| } |
| } |
| |
| /* Account for the base register, which needs to be available everywhere. */ |
| limit->full += 2; |
| |
| ralloc_free(ctx); |
| } |
| |
| /* |
| * If barriers are used, it must be possible for all waves in the workgroup |
| * to execute concurrently. Thus we may have to reduce the register limit. |
| */ |
| static void |
| calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v, |
| struct ir3_pressure *limit_pressure) |
| { |
| const struct ir3_compiler *compiler = v->compiler; |
| |
| unsigned threads_per_wg; |
| if (v->local_size_variable) { |
| /* We have to expect the worst case. */ |
| threads_per_wg = compiler->max_variable_workgroup_size; |
| } else { |
| threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2]; |
| } |
| |
| /* The register file is divided into reg_size_vec4 parts. Each part has |
| * enough registers to give one vec4 register to each thread of a |
| * single-sized wave-pair. With double threadsize, each wave-pair consumes |
| * two parts of the register file to get one vec4 per thread. The more |
| * active wave-pairs there are, the fewer parts each can get. |
| */ |
| |
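| /* A hedged worked example with illustrative, not gen-accurate, numbers: |
| * threads_per_wg = 1024, threadsize_base = 64, wave_granularity = 2, |
| * double threadsize, reg_size_vec4 = 96. Then waves_per_wg = |
| * DIV_ROUND_UP(1024, 64 * 2 * 2) = 4, vec4_regs_per_thread = 96 / (4 * 2) |
| * = 12, and each thread may use at most 12 * 4 * 2 = 96 half-regs before |
| * the workgroup can no longer run all its waves concurrently. |
| */ |
| |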
| bool double_threadsize = ir3_should_double_threadsize(v, 0); |
| unsigned waves_per_wg = DIV_ROUND_UP( |
| threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) * |
| compiler->wave_granularity); |
| |
| uint32_t vec4_regs_per_thread = |
| compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1)); |
| assert(vec4_regs_per_thread > 0); |
| |
| uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2; |
| |
| if (limit_pressure->full > half_regs_per_thread) { |
| if (v->mergedregs) { |
| limit_pressure->full = half_regs_per_thread; |
| } else { |
| /* TODO: Handle !mergedregs case, probably we would have to do this |
| * after the first register pressure pass. |
| */ |
| } |
| } |
| } |
| |
| int |
| ir3_ra(struct ir3_shader_variant *v) |
| { |
| ir3_calc_dominance(v->ir); |
| |
| ir3_create_parallel_copies(v->ir); |
| |
| struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); |
| |
| ctx->merged_regs = v->mergedregs; |
| ctx->compiler = v->compiler; |
| ctx->stage = v->type; |
| |
| struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir); |
| |
| ir3_debug_print(v->ir, "AFTER: create_parallel_copies"); |
| |
| ir3_merge_regs(live, v->ir); |
| |
| struct ir3_pressure max_pressure; |
| ir3_calc_pressure(v, live, &max_pressure); |
| d("max pressure:"); |
| d("\tfull: %u", max_pressure.full); |
| d("\thalf: %u", max_pressure.half); |
| d("\tshared: %u", max_pressure.shared); |
| |
| struct ir3_pressure limit_pressure; |
| limit_pressure.full = RA_FULL_SIZE; |
| limit_pressure.half = RA_HALF_SIZE; |
| limit_pressure.shared = RA_SHARED_SIZE; |
| |
| if (gl_shader_stage_is_compute(v->type) && v->has_barrier) { |
| calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure); |
| } |
| |
| /* If the user forces a doubled threadsize, we may have to lower the limit |
| * because on some gens the register file is not big enough to hold a |
| * double-size wave with all 48 registers in use. |
| */ |
| if (v->real_wavesize == IR3_DOUBLE_ONLY) { |
| limit_pressure.full = |
| MIN2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16); |
| } |
| |
| /* If requested, lower the limit so that spilling happens more often. */ |
| if (ir3_shader_debug & IR3_DBG_SPILLALL) |
| calc_min_limit_pressure(v, live, &limit_pressure); |
| |
| if (max_pressure.shared > limit_pressure.shared) { |
| /* TODO shared reg -> normal reg spilling */ |
| d("shared max pressure exceeded!"); |
| goto fail; |
| } |
| |
| bool spilled = false; |
| if (max_pressure.full > limit_pressure.full || |
| max_pressure.half > limit_pressure.half) { |
| if (!v->compiler->has_pvtmem) { |
| d("max pressure exceeded!"); |
| goto fail; |
| } |
| d("max pressure exceeded, spilling!"); |
| IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure); |
| ir3_calc_pressure(v, live, &max_pressure); |
| assert(max_pressure.full <= limit_pressure.full && |
| max_pressure.half <= limit_pressure.half); |
| spilled = true; |
| } |
| |
| ctx->live = live; |
| ctx->intervals = |
| rzalloc_array(ctx, struct ra_interval, live->definitions_count); |
| ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count); |
| |
| ctx->full.size = calc_target_full_pressure(v, max_pressure.full); |
| d("full size: %u", ctx->full.size); |
| |
| if (!v->mergedregs) |
| ctx->half.size = RA_HALF_SIZE; |
| |
| ctx->shared.size = RA_SHARED_SIZE; |
| |
| ctx->full.start = ctx->half.start = ctx->shared.start = 0; |
| |
| foreach_block (block, &v->ir->block_list) |
| handle_block(ctx, block); |
| |
| ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count); |
| |
| /* Strip array-ness and SSA-ness at the end, because various helpers still |
| * need to work even on definitions that have already been assigned. For |
| * example, we need to preserve array-ness so that array live-ins have the |
| * right size. |
| */ |
| foreach_block (block, &v->ir->block_list) { |
| foreach_instr (instr, &block->instr_list) { |
| for (unsigned i = 0; i < instr->dsts_count; i++) { |
| instr->dsts[i]->flags &= ~IR3_REG_SSA; |
| |
| /* Parallel copies of array registers copy the whole register, and |
| * we need some way to let the parallel copy code know that this was |
| * an array whose size is determined by reg->size. So keep the array |
| * flag on those. spill/reload also need to work on the entire |
| * array. |
| */ |
| if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO) |
| instr->dsts[i]->flags &= ~IR3_REG_ARRAY; |
| } |
| |
| for (unsigned i = 0; i < instr->srcs_count; i++) { |
| instr->srcs[i]->flags &= ~IR3_REG_SSA; |
| |
| if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO) |
| instr->srcs[i]->flags &= ~IR3_REG_ARRAY; |
| } |
| } |
| } |
| |
| ir3_debug_print(v->ir, "AFTER: register allocation"); |
| |
| if (spilled) { |
| IR3_PASS(v->ir, ir3_lower_spill); |
| } |
| |
| ir3_lower_copies(v); |
| |
| ir3_debug_print(v->ir, "AFTER: ir3_lower_copies"); |
| |
| ralloc_free(ctx); |
| |
| return 0; |
| fail: |
| ralloc_free(ctx); |
| return -1; |
| } |