/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf).  We
 * make up the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics).  Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results.  The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results because it usually won't be
 * the only usage, so that instruction will increase pressure.
 *
 * If you try to force consumption of tex results always, then in a case where
 * a single sample is used for many outputs, you'll end up picking every other
 * user and expanding register pressure.  The partially_evaluated_path flag
 * helps tremendously, in that if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample.  Future
 * work may include doing some local search before locking in a choice, to try
 * to more reliably find the case where just a few choices going against the
 * heuristic can manage to free the whole vector.
 */

/* When true, the scheduler prints its DAG state and every choice it makes to
 * stderr.  It is not assigned anywhere in the code visible here, so it has to
 * be flipped by hand (source edit or debugger).
 */
static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;

   /* Set once another use of one of this instruction's non-constant SSA
    * sources has been scheduled; the CSR heuristic then prefers this node so
    * that the value gets fully consumed.
    */
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;

/**
 * Scheduling state.  The value-tracking members (remaining_uses, live_values,
 * pressure, time) persist across blocks; dag and instr_map are rebuilt for
 * each block by nir_schedule_block().
 */
typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to a struct set of
    * instructions remaining to be scheduled using the register.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_schedule_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have been
    * scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;

/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

/* Singly-linked list entry tracking the last node seen for one
 * backend-defined dependency class (see nir_schedule_get_class_dep()).
 */
struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};

/* State for one dependency-calculation walk (forward or reverse) over a
 * block.
 */
typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for last instruction involved in some class of dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;

/* Looks up @key in @ht and returns the stored data pointer, or NULL if the
 * key is not present.
 */
static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}

/* Returns the scheduler node registered for @instr in @instr_map, or NULL if
 * the instruction has no node there.
 */
static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}

/* Returns the set of not-yet-scheduled instructions referencing the SSA def
 * or register read by @src (NULL if the value is not tracked).
 */
static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   if (src->is_ssa) {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
   } else {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
                                          src->reg.reg);
   }
}

/* Register pressure contributed by an SSA def: one unit per component. */
static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
   return def->num_components;
}

/* Register pressure of the value read by @src (SSA def or register). */
static int
nir_schedule_src_pressure(nir_src *src)
{
   if (src->is_ssa)
      return nir_schedule_def_pressure(src->ssa);
   else
      return src->reg.reg->num_components;
}

/* Register pressure of the value written by @dest (SSA def or register). */
static int
nir_schedule_dest_pressure(nir_dest *dest)
{
   if (dest->is_ssa)
      return nir_schedule_def_pressure(&dest->ssa);
   else
      return dest->reg.reg->num_components;
}

/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * Does nothing if either node is NULL.  In the forward direction the edge is
 * added as dag_add_edge(before, after); in the reverse direction the two
 * arguments are swapped, because the reverse walk sees the later instruction
 * first.
 *
 * NOTE(review): assuming dag_add_edge() takes (parent, child, data) -- confirm
 * against util/dag.h -- this makes the earlier instruction the parent, which
 * matches how the rest of this file consumes edges (DAG heads are scheduled
 * first and children become ready afterwards).
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, NULL);
   else
      dag_add_edge(&after->dag, &before->dag, NULL);
}

/* Orders @after relative to the last writer @before without changing which
 * node is tracked as the last writer.
 */
static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}

/* Orders @after relative to the tracked node *@before, then makes @after the
 * new tracked node.
 */
static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}

/* nir_foreach_src() callback: for a register source, adds a dependency
 * between the register's last tracked writer and the reading instruction.
 * SSA sources and registers with no tracked writer are ignored.  Always
 * returns true so iteration continues.
 */
static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
   nir_deps_state *state = in_state;

   if (src->is_ssa)
      return true;

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      src->reg.reg);
   if (!entry)
      return true;
   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            src->parent_instr);

   add_dep(state, dst_n, src_n);

   return true;
}

/* nir_foreach_dest() callback: for a register destination, orders this write
 * against the register's previously tracked writer and records this
 * instruction as the new last writer.  SSA destinations are ignored.  Always
 * returns true so iteration continues.
 */
static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
   nir_deps_state *state = in_state;

   if (dest->is_ssa)
      return true;

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            dest->reg.parent_instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      dest->reg.reg);
   if (!entry) {
      /* First write seen in this walk: just start tracking it. */
      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
      return true;
   }
   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);

   return true;
}

/* nir_foreach_ssa_def() callback: makes every use of @def depend on the
 * defining instruction.  Uses whose instruction has no node in the current
 * instr_map yield a NULL node, which add_dep() ignores.  Always returns true.
 */
static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       src->parent_instr);

      add_read_dep(state, def_n, use_n);
   }

   return true;
}

/* Returns the tracking entry for dependency class @klass, creating an empty
 * one (node == NULL) at the head of the list on first use.  Entries are
 * ralloc'ed with state->reg_map as parent, so they are released when the
 * reg_map is freed.
 */
static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}

/* Adds the ordering dependencies required by an intrinsic: first any
 * backend-specified class dependency reported by options->intrinsic_cb, then
 * the built-in rules for the intrinsic's opcode.
 */
static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we could use to add a read_dep while
       *   handling nir_instr_type_tex
       *
       * * Include it on the unknown intrinsic set, as we want discard to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
      /* Don't move load_shared beyond a following store_shared, as it could
       * change their value
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);

   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}

/* Walks @block top to bottom adding dependency edges.  The temporary reg_map
 * (and the class-dep entries allocated under it) is freed before returning.
 */
static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

/* Walks @block bottom to top adding dependency edges, so that reads are also
 * ordered against the writes that follow them.  The temporary reg_map is
 * freed before returning.
 */
static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

/* Accumulator for nir_schedule_regs_freed(): regs_freed is the net number of
 * channels that scheduling the instruction would release (negative when it
 * would increase pressure).
 */
typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;

/* nir_foreach_src() callback: credits the source's channels when this
 * instruction is the only remaining tracked user of the value.
 */
static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, src->parent_instr)) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}

/* nir_foreach_ssa_def() callback: every SSA def produced costs its channels. */
static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}

/* nir_foreach_dest() callback: charges a register destination's channels if
 * the register is not live yet.  SSA destinations are skipped here because
 * they are accounted for by the def callback.
 */
static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;

   if (dest->is_ssa)
      return true;

   nir_register *reg = dest->reg.reg;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_dest_pressure(dest);

   return true;
}

/* Returns the net number of channels that scheduling @n now would free
 * (positive) or newly occupy (negative), based on the scoreboard's current
 * remaining-use and liveness tracking.
 */
static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);

   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);

   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);

   return state.regs_freed;
}

/**
 * Chooses an instruction that will minimise the register pressure as much as
 * possible. This should only be used as a fallback when the regular scheduling
 * generates a shader whose register allocation fails.
 *
 * The DAG head list must not be empty: with no heads this returns NULL (and
 * dereferences NULL when debug output is enabled).
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible, otherwise picks the instruction on the critical path.
 *
 * The DAG head list must not be empty: with no heads this returns NULL (and
 * dereferences NULL when debug output is enabled).
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 *
 * The candidate classes are tried in order, and within each class the head
 * with the largest max_delay wins: ready heads that free registers, any heads
 * that free registers, heads on a partially evaluated path, heads with no net
 * register effect, remaining ready heads, and finally any head.
 *
 * The DAG head list must not be empty: with no heads this returns NULL (and
 * dereferences NULL when debug output is enabled).
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs.  This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value will tend to block finding freeing that value.  This had a
    * massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }
      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/* Debug helper: prints every current DAG head with its max_delay, followed by
 * each of its children and that child's parent count, to stderr.
 */
static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      /* max_delay is a uint32_t, so print it with an unsigned conversion. */
      fprintf(stderr, "maxdel %5u ", (unsigned)n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}

/* Updates liveness and pressure accounting for one reference to a value
 * (@reg_or_def) made by the instruction @reg_or_def_parent that is being
 * scheduled.  @pressure is the number of channels the value occupies.
 */
static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use.  Be careful when one
    * instruction uses a value twice to not decrement pressure twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}

/* nir_foreach_src() callback run when an instruction is scheduled: flags the
 * other users of a non-constant SSA source as being on a partially evaluated
 * path, then updates the use/pressure tracking for the source.
 */
static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear if that would help or not.  Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            /* n is NULL for users that have no node in the current
             * instr_map.
             */
            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, " New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));

   return true;
}

/* nir_foreach_ssa_def() callback run when an instruction is scheduled:
 * accounts for the SSA value it defines.
 */
static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}

/* nir_foreach_dest() callback run when an instruction is scheduled: accounts
 * for a register destination.
 */
static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   /* SSA defs were handled in nir_schedule_mark_def_scheduled()
    */
   if (dest->is_ssa)
      return true;

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow.  We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, dest->reg.reg,
                         dest->reg.parent_instr,
                         nir_schedule_dest_pressure(dest));

   return true;
}

/* Commits the choice of @n: updates value tracking for its sources and
 * results, pushes back the ready_time of its children by n->delay, removes it
 * from the DAG heads and advances the scoreboard clock.
 */
static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      /* n is the child's last remaining parent, so pruning n below will turn
       * the child into a head.
       */
      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, " New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   /* If the instruction was not ready yet, model the stall by jumping the
    * clock to its ready time before counting the instruction itself.
    */
   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}

/* Main scheduling loop for one block: repeatedly picks a DAG head with the
 * heuristic selected by the options and current pressure, and moves the
 * chosen instruction to the end of the block's instruction list.
 */
static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}

/* Returns the estimated latency of @instr in abstract scoreboard time units:
 * 100 for texture instructions, 1 for every other known instruction type, and
 * 0 for a type value not covered by the switch.
 */
static uint32_t
nir_schedule_get_delay(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   case nir_instr_type_intrinsic:
      /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
      return 1;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}

/* dag_traverse_bottom_up() callback: sets the node's max_delay to its own
 * delay plus the largest max_delay among its children (never lowering a value
 * that is already larger).  @state is unused.
 */
static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

/* Schedules a single block: builds a node per instruction, computes the
 * dependency DAG and critical-path costs, then reorders the instructions.
 * All per-block allocations live in a temporary ralloc context that is freed
 * before returning.
 */
static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);

   /* Both of these were allocated out of mem_ctx; don't leave dangling
    * pointers behind in the scoreboard.
    */
   scoreboard->instr_map = NULL;
   scoreboard->dag = NULL;
}

/* nir_foreach_ssa_def() callback used while building the scoreboard: creates
 * the remaining-uses set for @def, seeded with the defining instruction and
 * every instruction that uses the def.
 */
static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, src->parent_instr);
   }

   /* XXX: Handle if uses */

   return true;
}

/* Allocates the scoreboard for @shader and fills remaining_uses with one set
 * per register and per SSA def.  Everything is ralloc'ed with the scoreboard
 * as parent, so the caller releases it with ralloc_free(scoreboard).
 * Functions without an implementation are skipped, matching nir_schedule().
 */
static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function(function, shader) {
      /* A function may have no implementation; there is nothing to track. */
      if (!function->impl)
         continue;

      nir_foreach_register(reg, &function->impl->registers) {
         struct set *register_uses =
            _mesa_pointer_set_create(scoreboard);

         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);

         nir_foreach_use(src, reg) {
            _mesa_set_add(register_uses, src->parent_instr);
         }

         /* XXX: Handle if uses */

         nir_foreach_def(dest, reg) {
            _mesa_set_add(register_uses, dest->reg.parent_instr);
         }
      }

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
                                scoreboard);
         }

         /* XXX: We're ignoring if uses, which may prioritize scheduling other
          * uses of the if src even when it doesn't help.  That's not many
          * values, though, so meh.
          */
      }
   }

   return scoreboard;
}

/* Debug-build sanity check: after scheduling, every remaining-uses set should
 * be empty.  Prints any leftover instructions to stderr and asserts.  Does
 * nothing when NDEBUG is defined.
 */
static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr,
                    "Tracked uses remain after scheduling. "
                    "Affected instructions: \n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}

/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard).  This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately.  The amount below the limit is up to you to
 * tune.
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}