bolt/lib/Passes/ReorderAlgorithm.cpp - third_party/llvm-project - Git at Google

 //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements classes used by several basic block reordering
 // algorithms.
 //
 //===----------------------------------------------------------------------===//

 #include "bolt/Passes/ReorderAlgorithm.h"
 #include "bolt/Core/BinaryBasicBlock.h"
 #include "bolt/Core/BinaryFunction.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/CodeLayout.h"
 #include <queue>
 #include <random>
 #include <stack>

 #undef DEBUG_TYPE
 #define DEBUG_TYPE "bolt"

 using namespace llvm;
 using namespace bolt;

 namespace opts {

 extern cl::OptionCategory BoltOptCategory;
 extern cl::opt<bool> NoThreads;

 static cl::opt<unsigned> ColdThreshold(
     "cold-threshold",
     cl::desc("tenths of percents of main entry frequency to use as a "
              "threshold when evaluating whether a basic block is cold "
              "(0 means it is only considered cold if the block has zero "
              "samples). Default: 0 "),
     cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

 static cl::opt<bool> PrintClusters("print-clusters", cl::desc("print clusters"),
                                    cl::Hidden, cl::cat(BoltOptCategory));

 cl::opt<uint32_t> RandomSeed("bolt-seed", cl::desc("seed for randomization"),
                              cl::init(42), cl::Hidden,
                              cl::cat(BoltOptCategory));

 } // namespace opts

 namespace {

 template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
   std::hash<T> Hasher;
   Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
 }

 template <typename A, typename B> struct HashPair {
   size_t operator()(const std::pair<A, B> &Val) const {
     std::hash<A> Hasher;
     size_t Seed = Hasher(Val.first);
     hashCombine(Seed, Val.second);
     return Seed;
   }
 };

 } // namespace

 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
   // Create a separate MCCodeEmitter to allow lock-free execution
   BinaryContext::IndependentCodeEmitter Emitter;
   if (!opts::NoThreads)
     Emitter = BC.createIndependentMCCodeEmitter();

   AvgFreq.resize(Clusters.size(), 0.0);
   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
     double Freq = 0.0;
     uint64_t ClusterSize = 0;
     for (const BinaryBasicBlock *BB : Clusters[I]) {
       if (BB->getNumNonPseudos() > 0) {
         Freq += BB->getExecutionCount();
         // Estimate the size of a block in bytes at run time
         // NOTE: This might be inaccurate
         ClusterSize += BB->estimateSize(Emitter.MCE.get());
       }
     }
     AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
   }
 }

 void ClusterAlgorithm::printClusters() const {
   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
     errs() << "Cluster number " << I;
     if (AvgFreq.size() == Clusters.size())
       errs() << " (frequency: " << AvgFreq[I] << ")";
     errs() << " : ";
     const char *Sep = "";
     for (const BinaryBasicBlock *BB : Clusters[I]) {
       errs() << Sep << BB->getName();
       Sep = ", ";
     }
     errs() << "\n";
   }
 }

 void ClusterAlgorithm::reset() {
   Clusters.clear();
   ClusterEdges.clear();
   AvgFreq.clear();
 }

 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
   OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
 }

 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
   HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
   return Hasher(std::make_pair(E.Src, E.Dst));
 }

 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A,
                                                    const EdgeTy &B) const {
   return A.Src == B.Src && A.Dst == B.Dst;
 }

 void GreedyClusterAlgorithm::clusterBasicBlocks(BinaryFunction &BF,
                                                 bool ComputeEdges) {
   reset();

   // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
   // maximize weight during a path traversing all BBs. In this way, we will
   // convert the hottest branches into fall-throughs.

   // This is the queue of edges from which we will pop edges and use them to
   // cluster basic blocks in a greedy fashion.
   std::vector<EdgeTy> Queue;

   // Initialize inter-cluster weights.
   if (ComputeEdges)
     ClusterEdges.resize(BF.getLayout().block_size());

   // Initialize clusters and edge queue.
   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
     // Create a cluster for this BB.
     uint32_t I = Clusters.size();
     Clusters.emplace_back();
     std::vector<BinaryBasicBlock *> &Cluster = Clusters.back();
     Cluster.push_back(BB);
     BBToClusterMap[BB] = I;
     // Populate priority queue with edges.
     auto BI = BB->branch_info_begin();
     for (const BinaryBasicBlock *I : BB->successors()) {
       assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
              "attempted reordering blocks of function with no profile data");
       Queue.emplace_back(EdgeTy(BB, I, BI->Count));
       ++BI;
     }
   }
   // Sort and adjust the edge queue.
   initQueue(Queue, BF);

   // Grow clusters in a greedy fashion.
   while (!Queue.empty()) {
     EdgeTy E = Queue.back();
     Queue.pop_back();

     const BinaryBasicBlock *SrcBB = E.Src;
     const BinaryBasicBlock *DstBB = E.Dst;

     LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");

     // Case 1: BBSrc and BBDst are the same. Ignore this edge
     if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) {
       LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
       continue;
     }

     int I = BBToClusterMap[SrcBB];
     int J = BBToClusterMap[DstBB];

     // Case 2: If they are already allocated at the same cluster, just increase
     // the weight of this cluster
     if (I == J) {
       if (ComputeEdges)
         ClusterEdges[I][I] += E.Count;
       LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
       continue;
     }

     std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
     std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
     if (areClustersCompatible(ClusterA, ClusterB, E)) {
       // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
       // allowing us to merge two clusters.
       for (const BinaryBasicBlock *BB : ClusterB)
         BBToClusterMap[BB] = I;
       ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
       ClusterB.clear();
       if (ComputeEdges) {
         // Increase the intra-cluster edge count of cluster A with the count of
         // this edge as well as with the total count of previously visited edges
         // from cluster B cluster A.
         ClusterEdges[I][I] += E.Count;
         ClusterEdges[I][I] += ClusterEdges[J][I];
         // Iterate through all inter-cluster edges and transfer edges targeting
         // cluster B to cluster A.
         for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
           ClusterEdges[K][I] += ClusterEdges[K][J];
       }
       // Adjust the weights of the remaining edges and re-sort the queue.
       adjustQueue(Queue, BF);
       LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
     } else {
       // Case 4: Both SrcBB and DstBB are allocated in positions we cannot
       // merge them. Add the count of this edge to the inter-cluster edge count
       // between clusters A and B to help us decide ordering between these
       // clusters.
       if (ComputeEdges)
         ClusterEdges[I][J] += E.Count;
       LLVM_DEBUG(
           dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
     }
   }
 }

 void GreedyClusterAlgorithm::reset() {
   ClusterAlgorithm::reset();
   BBToClusterMap.clear();
 }

 void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
                                          const BinaryFunction &BF) {
   // Define a comparison function to establish SWO between edges.
   auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) {
     // With equal weights, prioritize branches with lower index
     // source/destination. This helps to keep original block order for blocks
     // when optimal order cannot be deducted from a profile.
     if (A.Count == B.Count) {
       const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
       return (SrcOrder != 0)
                  ? SrcOrder > 0
                  : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
     }
     return A.Count < B.Count;
   };

   // Sort edges in increasing profile count order.
   llvm::sort(Queue, Comp);
 }

 void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                            const BinaryFunction &BF) {
   // Nothing to do.
 }

 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front,
                                                      const ClusterTy &Back,
                                                      const EdgeTy &E) const {
   return Front.back() == E.Src && Back.front() == E.Dst;
 }

 int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
     const EdgeTy &E, const BinaryFunction &BF) const {
   const BinaryBasicBlock *SrcBB = E.Src;
   const BinaryBasicBlock *DstBB = E.Dst;

   // Initial weight value.
   int64_t W = (int64_t)E.Count;

   // Adjust the weight by taking into account other edges with the same source.
   auto BI = SrcBB->branch_info_begin();
   for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
     assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
            "attempted reordering blocks of function with no profile data");
     assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
            "overflow detected");
     // Ignore edges with same source and destination, edges that target the
     // entry block as well as the edge E itself.
     if (SuccBB != SrcBB && SuccBB != *BF.getLayout().block_begin() &&
         SuccBB != DstBB)
       W -= (int64_t)BI->Count;
     ++BI;
   }

   // Adjust the weight by taking into account other edges with the same
   // destination.
   for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
     // Ignore edges with same source and destination as well as the edge E
     // itself.
     if (PredBB == DstBB || PredBB == SrcBB)
       continue;
     auto BI = PredBB->branch_info_begin();
     for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
       if (SuccBB == DstBB)
         break;
       ++BI;
     }
     assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
     assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
            "attempted reordering blocks of function with no profile data");
     assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
            "overflow detected");
     W -= (int64_t)BI->Count;
   }

   return W;
 }

 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
                                                 const BinaryFunction &BF) {
   // Initialize edge weights.
   for (const EdgeTy &E : Queue)
     Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));

   // Sort edges in increasing weight order.
   adjustQueue(Queue, BF);
 }

 void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                                   const BinaryFunction &BF) {
   // Define a comparison function to establish SWO between edges.
   auto Comp = [&](const EdgeTy &A, const EdgeTy &B) {
     // With equal weights, prioritize branches with lower index
     // source/destination. This helps to keep original block order for blocks
     // when optimal order cannot be deduced from a profile.
     if (Weight[A] == Weight[B]) {
       const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
       return (SrcOrder != 0)
                  ? SrcOrder > 0
                  : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
     }
     return Weight[A] < Weight[B];
   };

   // Iterate through all remaining edges to find edges that have their
   // source and destination in the same cluster.
   std::vector<EdgeTy> NewQueue;
   for (const EdgeTy &E : Queue) {
     const BinaryBasicBlock *SrcBB = E.Src;
     const BinaryBasicBlock *DstBB = E.Dst;

     // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
     // this edge.
     if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) {
       LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                  dbgs() << " (same src, dst)\n");
       continue;
     }

     int I = BBToClusterMap[SrcBB];
     int J = BBToClusterMap[DstBB];
     std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
     std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];

     // Case 2: They are already allocated at the same cluster or incompatible
     // clusters. Adjust the weights of edges with the same source or
     // destination, so that this edge has no effect on them any more, and ignore
     // this edge. Also increase the intra- (or inter-) cluster edge count.
     if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
       if (!ClusterEdges.empty())
         ClusterEdges[I][J] += E.Count;
       LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                  dbgs() << " (src, dst belong to same cluster or incompatible "
                            "clusters)\n");
       for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
         if (SuccBB == DstBB)
           continue;
         auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
         assert(WI != Weight.end() && "CFG edge not found in Weight map");
         WI->second += (int64_t)E.Count;
       }
       for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
         if (PredBB == SrcBB)
           continue;
         auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
         assert(WI != Weight.end() && "CFG edge not found in Weight map");
         WI->second += (int64_t)E.Count;
       }
       continue;
     }

     // Case 3: None of the previous cases is true, so just keep this edge in
     // the queue.
     NewQueue.emplace_back(E);
   }

   // Sort remaining edges in increasing weight order.
   Queue.swap(NewQueue);
   llvm::sort(Queue, Comp);
 }

 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
     const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
   return Front.back() == E.Src && Back.front() == E.Dst;
 }

 void MinBranchGreedyClusterAlgorithm::reset() {
   GreedyClusterAlgorithm::reset();
   Weight.clear();
 }

 void TSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                              BasicBlockOrder &Order) const {
   std::vector<std::vector<uint64_t>> Weight;
   std::vector<BinaryBasicBlock *> IndexToBB;

   const size_t N = BF.getLayout().block_size();
   assert(N <= std::numeric_limits<uint64_t>::digits &&
          "cannot use TSP solution for sizes larger than bits in uint64_t");

   // Populating weight map and index map
   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
     BB->setLayoutIndex(IndexToBB.size());
     IndexToBB.push_back(BB);
   }
   Weight.resize(N);
   for (const BinaryBasicBlock *BB : BF.getLayout().blocks()) {
     auto BI = BB->branch_info_begin();
     Weight[BB->getLayoutIndex()].resize(N);
     for (BinaryBasicBlock *SuccBB : BB->successors()) {
       if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
         Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count;
       ++BI;
     }
   }

   std::vector<std::vector<int64_t>> DP;
   DP.resize(static_cast<size_t>(1) << N);
   for (std::vector<int64_t> &Elmt : DP)
     Elmt.resize(N, -1);

   // Start with the entry basic block being allocated with cost zero
   DP[1][0] = 0;
   // Walk through TSP solutions using a bitmask to represent state (current set
   // of BBs in the layout)
   uint64_t BestSet = 1;
   uint64_t BestLast = 0;
   int64_t BestWeight = 0;
   for (uint64_t Set = 1; Set < (1ULL << N); ++Set) {
     // Traverse each possibility of Last BB visited in this layout
     for (uint64_t Last = 0; Last < N; ++Last) {
       // Case 1: There is no possible layout with this BB as Last
       if (DP[Set][Last] == -1)
         continue;

       // Case 2: There is a layout with this Set and this Last, and we try
       // to expand this set with New
       for (uint64_t New = 1; New < N; ++New) {
         // Case 2a: BB "New" is already in this Set
         if ((Set & (1ULL << New)) != 0)
           continue;

         // Case 2b: BB "New" is not in this set and we add it to this Set and
         // record total weight of this layout with "New" as the last BB.
         uint64_t NewSet = (Set | (1ULL << New));
         if (DP[NewSet][New] == -1)
           DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
         DP[NewSet][New] = std::max(DP[NewSet][New],
                                    DP[Set][Last] + (int64_t)Weight[Last][New]);

         if (DP[NewSet][New] > BestWeight) {
           BestWeight = DP[NewSet][New];
           BestSet = NewSet;
           BestLast = New;
         }
       }
     }
   }

   // Define final function layout based on layout that maximizes weight
   uint64_t Last = BestLast;
   uint64_t Set = BestSet;
   BitVector Visited;
   Visited.resize(N);
   Visited[Last] = true;
   Order.push_back(IndexToBB[Last]);
   Set = Set & ~(1ULL << Last);
   while (Set != 0) {
     int64_t Best = -1;
     uint64_t NewLast;
     for (uint64_t I = 0; I < N; ++I) {
       if (DP[Set][I] == -1)
         continue;
       int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0;
       if (DP[Set][I] + AdjWeight > Best) {
         NewLast = I;
         Best = DP[Set][I] + AdjWeight;
       }
     }
     Last = NewLast;
     Visited[Last] = true;
     Order.push_back(IndexToBB[Last]);
     Set = Set & ~(1ULL << Last);
   }
   std::reverse(Order.begin(), Order.end());

   // Finalize layout with BBs that weren't assigned to the layout using the
   // input layout.
   for (BinaryBasicBlock *BB : BF.getLayout().blocks())
     if (Visited[BB->getLayoutIndex()] == false)
       Order.push_back(BB);
 }

 void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                                 BasicBlockOrder &Order) const {
   if (BF.getLayout().block_empty())
     return;

   // Do not change layout of functions w/o profile information
   if (!BF.hasValidProfile() || BF.getLayout().block_size() <= 2) {
     for (BinaryBasicBlock *BB : BF.getLayout().blocks())
       Order.push_back(BB);
     return;
   }

   // Create a separate MCCodeEmitter to allow lock-free execution
   BinaryContext::IndependentCodeEmitter Emitter;
   if (!opts::NoThreads)
     Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();

   // Initialize CFG nodes and their data
   std::vector<uint64_t> BlockSizes;
   std::vector<uint64_t> BlockCounts;
   BasicBlockOrder OrigOrder;
   BF.getLayout().updateLayoutIndices();
   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
     uint64_t Size = std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1);
     BlockSizes.push_back(Size);
     BlockCounts.push_back(BB->getKnownExecutionCount());
     OrigOrder.push_back(BB);
   }

   // Initialize CFG edges
   std::vector<codelayout::EdgeCount> JumpCounts;
   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
     auto BI = BB->branch_info_begin();
     for (BinaryBasicBlock *SuccBB : BB->successors()) {
       assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
              "missing profile for a jump");
       JumpCounts.push_back(
           {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count});
       ++BI;
     }
   }

   // Run the layout algorithm
   auto Result =
       codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
   Order.reserve(BF.getLayout().block_size());
   for (uint64_t R : Result)
     Order.push_back(OrigOrder[R]);
 }

 void OptimizeReorderAlgorithm::reorderBasicBlocks(
     BinaryFunction &BF, BasicBlockOrder &Order) const {
   if (BF.getLayout().block_empty())
     return;

   // Cluster basic blocks.
   CAlgo->clusterBasicBlocks(BF);

   if (opts::PrintClusters)
     CAlgo->printClusters();

   // Arrange basic blocks according to clusters.
   for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
 }

 void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
     BinaryFunction &BF, BasicBlockOrder &Order) const {
   if (BF.getLayout().block_empty())
     return;

   // Cluster basic blocks.
   CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true);
   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
   std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges =
       CAlgo->ClusterEdges;

   // Compute clusters' average frequencies.
   CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
   std::vector<double> &AvgFreq = CAlgo->AvgFreq;

   if (opts::PrintClusters)
     CAlgo->printClusters();

   // Cluster layout order
   std::vector<uint32_t> ClusterOrder;

   // Do a topological sort for clusters, prioritizing frequently-executed BBs
   // during the traversal.
   std::stack<uint32_t> Stack;
   std::vector<uint32_t> Status;
   std::vector<uint32_t> Parent;
   Status.resize(Clusters.size(), 0);
   Parent.resize(Clusters.size(), 0);
   constexpr uint32_t STACKED = 1;
   constexpr uint32_t VISITED = 2;
   Status[0] = STACKED;
   Stack.push(0);
   while (!Stack.empty()) {
     uint32_t I = Stack.top();
     if (!(Status[I] & VISITED)) {
       Status[I] |= VISITED;
       // Order successors by weight
       auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
         return ClusterEdges[I][A] > ClusterEdges[I][B];
       };
       std::priority_queue<uint32_t, std::vector<uint32_t>,
                           decltype(ClusterComp)>
           SuccQueue(ClusterComp);
       for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) {
         if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
             !Clusters[Target.first].empty()) {
           Parent[Target.first] = I;
           Status[Target.first] = STACKED;
           SuccQueue.push(Target.first);
         }
       }
       while (!SuccQueue.empty()) {
         Stack.push(SuccQueue.top());
         SuccQueue.pop();
       }
       continue;
     }
     // Already visited this node
     Stack.pop();
     ClusterOrder.push_back(I);
   }
   std::reverse(ClusterOrder.begin(), ClusterOrder.end());
   // Put unreachable clusters at the end
   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
     if (!(Status[I] & VISITED) && !Clusters[I].empty())
       ClusterOrder.push_back(I);

   // Sort nodes with equal precedence
   auto Beg = ClusterOrder.begin();
   // Don't reorder the first cluster, which contains the function entry point
   ++Beg;
   std::stable_sort(Beg, ClusterOrder.end(),
                    [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
                      uint32_t P = Parent[A];
                      while (Parent[P] != 0) {
                        if (Parent[P] == B)
                          return false;
                        P = Parent[P];
                      }
                      P = Parent[B];
                      while (Parent[P] != 0) {
                        if (Parent[P] == A)
                          return true;
                        P = Parent[P];
                      }
                      return AvgFreq[A] > AvgFreq[B];
                    });

   if (opts::PrintClusters) {
     errs() << "New cluster order: ";
     const char *Sep = "";
     for (uint32_t O : ClusterOrder) {
       errs() << Sep << O;
       Sep = ", ";
     }
     errs() << '\n';
   }

   // Arrange basic blocks according to cluster order.
   for (uint32_t ClusterIndex : ClusterOrder) {
     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
   }
 }

 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
     BinaryFunction &BF, BasicBlockOrder &Order) const {
   if (BF.getLayout().block_empty())
     return;

   const uint64_t ColdThreshold =
       opts::ColdThreshold *
       (*BF.getLayout().block_begin())->getExecutionCount() / 1000;

   // Cluster basic blocks.
   CAlgo->clusterBasicBlocks(BF);
   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

   // Compute clusters' average frequencies.
   CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
   std::vector<double> &AvgFreq = CAlgo->AvgFreq;

   if (opts::PrintClusters)
     CAlgo->printClusters();

   // Cluster layout order
   std::vector<uint32_t> ClusterOrder;

   // Order clusters based on average instruction execution frequency
   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
     if (!Clusters[I].empty())
       ClusterOrder.push_back(I);
   // Don't reorder the first cluster, which contains the function entry point
   std::stable_sort(
       std::next(ClusterOrder.begin()), ClusterOrder.end(),
       [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; });

   if (opts::PrintClusters) {
     errs() << "New cluster order: ";
     const char *Sep = "";
     for (uint32_t O : ClusterOrder) {
       errs() << Sep << O;
       Sep = ", ";
     }
     errs() << '\n';
   }

   // Arrange basic blocks according to cluster order.
   for (uint32_t ClusterIndex : ClusterOrder) {
     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
     // Force zero execution count on clusters that do not meet the cut off
     // specified by --cold-threshold.
     if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold))
       for (BinaryBasicBlock *BBPtr : Cluster)
         BBPtr->setExecutionCount(0);
   }
 }

 void ReverseReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                                  BasicBlockOrder &Order) const {
   if (BF.getLayout().block_empty())
     return;

   BinaryBasicBlock *FirstBB = *BF.getLayout().block_begin();
   Order.push_back(FirstBB);
   for (auto RLI = BF.getLayout().block_rbegin(); *RLI != FirstBB; ++RLI)
     Order.push_back(*RLI);
 }

 void RandomClusterReorderAlgorithm::reorderBasicBlocks(
     BinaryFunction &BF, BasicBlockOrder &Order) const {
   if (BF.getLayout().block_empty())
     return;

   // Cluster basic blocks.
   CAlgo->clusterBasicBlocks(BF);
   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

   if (opts::PrintClusters)
     CAlgo->printClusters();

   // Cluster layout order
   std::vector<uint32_t> ClusterOrder;

   // Order clusters based on average instruction execution frequency
   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
     if (!Clusters[I].empty())
       ClusterOrder.push_back(I);

   std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
                std::default_random_engine(opts::RandomSeed.getValue()));

   if (opts::PrintClusters) {
     errs() << "New cluster order: ";
     const char *Sep = "";
     for (uint32_t O : ClusterOrder) {
       errs() << Sep << O;
       Sep = ", ";
     }
     errs() << '\n';
   }

   // Arrange basic blocks according to cluster order.
   for (uint32_t ClusterIndex : ClusterOrder) {
     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
   }
 }