| //===--- RedundantLoadElimination.cpp - SIL Load Forwarding ---------------===// |
| // |
| // This source file is part of the Swift.org open source project |
| // |
| // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors |
| // Licensed under Apache License v2.0 with Runtime Library Exception |
| // |
| // See https://swift.org/LICENSE.txt for license information |
| // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors |
| // |
| //===----------------------------------------------------------------------===// |
| /// |
| /// \file |
| /// |
| /// This pass eliminates redundant loads. |
| /// |
| /// A load can be eliminated if the loaded value is already available at |
| /// that point, i.e. it was produced by a previous load from, or a store of |
| /// a known value to, the same LSLocation. |
| /// |
| /// In such cases, one can replace the load instruction with the previously |
| /// available value. |
| /// |
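| /// For example, in this hypothetical SIL sketch (types and surrounding |
| /// context elided): |
| /// |
| ///   store %v to %addr : $*Int |
| ///   %x = load %addr : $*Int      // redundant, can be replaced by %v |
| ///   %y = load %addr : $*Int      // redundant, can be replaced by %x |
| /// |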
| /// Redundant Load Elimination (RLE) eliminates such loads by: |
| /// |
| /// 1. Introducing the notion of an LSLocation that is used to model object |
| /// fields. (See below for more details). |
| /// |
| /// 2. Introducing the notion of an LSValue that is used to model the value |
| /// that currently resides in the associated LSLocation on the particular |
| /// program path. (See below for more details). |
| /// |
| /// 3. Performing an RPO walk over the control flow graph, tracking any |
| /// LSLocations that are read from or stored into in each basic block. The |
| /// read or stored value, kept in a map between LSLocation and LSValue, |
| /// becomes the available value for the LSLocation. |
| /// |
| /// 4. Performing an optimistic iterative intersection-based dataflow on the |
| /// gen and kill sets until convergence. |
| /// |
| /// At the core of RLE is the LSLocation class. An LSLocation is an |
| /// abstraction of an object field in the program. It consists of a base and |
| /// a projection path to the field accessed. |
| /// |
| /// In SIL, one can access an aggregate as a whole, e.g. store to a struct |
| /// with 2 Int fields. A store like this generates 2 *indivisible* |
| /// LSLocations, 1 for each field. In addition to keeping a list of |
| /// LSLocations, RLE also keeps their available LSValues. We call an |
| /// LSLocation *indivisible* because it cannot be broken down into smaller |
| /// LSLocations. |
| /// |
| /// An LSValue consists of a base (a SILValue from the load or store inst) |
| /// as well as a projection path to the field it represents. So, a store to |
| /// the 2-field struct mentioned above generates 2 LSLocations and |
| /// 2 LSValues. |
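| /// |
| /// For instance, in a hypothetical sketch where S is a struct with two Int |
| /// fields a and b, the store |
| /// |
| ///   store %s to %addr : $*S |
| /// |
| /// generates the LSLocations (%addr, #S.a) and (%addr, #S.b), written |
| /// informally as (Base, ProjectionPath) pairs, with the corresponding |
| /// LSValues (%s, #S.a) and (%s, #S.b). |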
| /// |
| /// Every basic block keeps a map between LSLocation and LSValue. By |
| /// keeping the LSLocation and LSValue in their indivisible form, one |
| /// can easily find which part of the load is redundant and how to compute its |
| /// forwarding value. |
| /// |
| /// In the case where both fields of the struct have available values, RLE |
| /// can find their LSValues (perhaps by a struct_extract from a larger |
| /// value) and then aggregate them. |
| /// |
| /// However, this may introduce a lot of extraction and aggregation which |
| /// may not be necessary, e.g. a store to the struct followed by a load from |
| /// the struct. To solve this problem, when RLE detects that a load |
| /// instruction can be replaced by a forwarded value, it tries to find the |
| /// minimum number of extractions necessary to form the forwarded value. It |
| /// groups the available values by their LSValue base, i.e. LSValues that |
| /// come from the same instruction, and then uses extraction to obtain the |
| /// needed components of the base, as in the sketch below. |
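| /// |
| /// For instance, continuing the hypothetical sketch above: |
| /// |
| ///   store %s to %addr : $*S |
| ///   %x = load %addr : $*S |
| /// |
| /// both available LSValues share the base %s, so instead of extracting and |
| /// re-aggregating the two fields, RLE forwards %s as a whole and replaces |
| /// %x with %s using no extractions at all. |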
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #define DEBUG_TYPE "sil-redundant-load-elim" |
| #include "swift/SIL/Projection.h" |
| #include "swift/SIL/SILArgument.h" |
| #include "swift/SIL/SILBuilder.h" |
| #include "swift/SILOptimizer/Analysis/AliasAnalysis.h" |
| #include "swift/SILOptimizer/Analysis/ARCAnalysis.h" |
| #include "swift/SILOptimizer/Analysis/DominanceAnalysis.h" |
| #include "swift/SILOptimizer/Analysis/PostOrderAnalysis.h" |
| #include "swift/SILOptimizer/Analysis/ValueTracking.h" |
| #include "swift/SILOptimizer/PassManager/Passes.h" |
| #include "swift/SILOptimizer/PassManager/Transforms.h" |
| #include "swift/SILOptimizer/Utils/CFG.h" |
| #include "swift/SILOptimizer/Utils/Local.h" |
| #include "swift/SILOptimizer/Utils/LoadStoreOptUtils.h" |
| #include "swift/SILOptimizer/Utils/SILSSAUpdater.h" |
| #include "llvm/ADT/BitVector.h" |
| #include "llvm/ADT/MapVector.h" |
| #include "llvm/ADT/None.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| |
| using namespace swift; |
| |
| STATISTIC(NumForwardedLoads, "Number of loads forwarded"); |
| |
| /// Return the AllocStackInst corresponding to the given instruction if it |
| /// is a DeallocStackInst, otherwise return nullptr. |
| static SILInstruction *findAllocStackInst(SILInstruction *I) { |
| if (DeallocStackInst *DSI = dyn_cast<DeallocStackInst>(I)) |
| return dyn_cast<SILInstruction>(DSI->getOperand()); |
| return nullptr; |
| } |
| |
| /// ComputeAvailSetMax - If we ignore all unknown writes, what is the max |
| /// available set that can reach a certain point in a basic block? This |
| /// helps generate the genset and killset, i.e. if there is no downward |
| /// visible value that can reach the end of a basic block, then we know that |
| /// the genset and killset for the location need not be set. |
| /// |
| /// ComputeAvailGenKillSet - Build the genset and killset of the basic block. |
| /// |
| /// ComputeAvailValue - Compute the available value at the end of the basic |
| /// block. |
| /// |
| /// PerformRLE - Perform the actual redundant load elimination. |
| enum class RLEKind : unsigned { |
| ComputeAvailSetMax = 0, |
| ComputeAvailGenKillSet = 1, |
| ComputeAvailValue = 2, |
| PerformRLE = 3, |
| }; |
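| |
| // The kinds above are applied in the order listed. A sketch of the driver |
| // (see RLEContext::run and RLEContext::runIterativeRLE below): |
| // |
| //   processBasicBlocksForGenKillSet();     // ComputeAvailSetMax and |
| //                                          // ComputeAvailGenKillSet |
| //   processBasicBlocksWithGenKillSet();    // iterate the sets to |
| //                                          // convergence |
| //   processBasicBlocksForAvailValue();     // ComputeAvailValue |
| //   processBasicBlocksForRLE(Optimistic);  // PerformRLE |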
| |
| //===----------------------------------------------------------------------===// |
| // Utility Functions |
| //===----------------------------------------------------------------------===// |
| |
| static bool inline isComputeAvailSetMax(RLEKind Kind) { |
| return Kind == RLEKind::ComputeAvailSetMax; |
| } |
| |
| static bool inline isComputeAvailGenKillSet(RLEKind Kind) { |
| return Kind == RLEKind::ComputeAvailGenKillSet; |
| } |
| |
| static bool inline isComputeAvailValue(RLEKind Kind) { |
| return Kind == RLEKind::ComputeAvailValue; |
| } |
| |
| static bool inline isPerformingRLE(RLEKind Kind) { |
| return Kind == RLEKind::PerformRLE; |
| } |
| |
| /// Returns true if this is an instruction that may have side effects in a |
| /// general sense but is inert from a load store perspective. |
| static bool isRLEInertInstruction(SILInstruction *Inst) { |
| switch (Inst->getKind()) { |
| case ValueKind::StrongRetainInst: |
| case ValueKind::StrongRetainUnownedInst: |
| case ValueKind::UnownedRetainInst: |
| case ValueKind::RetainValueInst: |
| case ValueKind::DeallocStackInst: |
| case ValueKind::CondFailInst: |
| case ValueKind::IsUniqueInst: |
| case ValueKind::IsUniqueOrPinnedInst: |
| case ValueKind::FixLifetimeInst: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Basic Block Location State |
| //===----------------------------------------------------------------------===// |
| namespace { |
| |
| /// If this function has too many basic blocks or too many locations, it may |
| /// take a long time to compute the genset and killset. The number of memory |
| /// behavior or alias queries we need to perform is, in the worst case, |
| /// roughly proportional to the number of basic blocks times the number of |
| /// locations. |
| /// |
| /// We run RLE on functions with up to 128 basic blocks and 128 locations, |
| /// which is a large function. |
| constexpr unsigned MaxLSLocationBBMultiplicationNone = 128*128; |
| |
| /// We run optimistic (multi-iteration) RLE on functions with up to 64 basic |
| /// blocks and 64 locations, which is a sizable function; past that we fall |
| /// back to a single pessimistic iteration. |
| constexpr unsigned MaxLSLocationBBMultiplicationPessimistic = 64*64; |
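| |
| // For scale: the cutoffs above allow at most 64*64 = 4,096 block/location |
| // pairs for the optimistic data flow and 128*128 = 16,384 pairs overall; |
| // past the latter the function is not optimized at all. |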
| |
| /// Forward declaration. |
| class RLEContext; |
| |
| /// Per-basic-block load/store state, which enables forwarding values from |
| /// previous loads and stores to later loads. |
| class BlockState { |
| public: |
| enum class ValueState : unsigned { |
| CoverValues = 0, |
| ConcreteValues = 1, |
| CoverAndConcreteValues = 2, |
| }; |
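| |
| // A hypothetical example of a covering value: if both predecessors of a |
| // block store different concrete values to the same LSLocation, the |
| // location has a value on every incoming path, but no single concrete |
| // SILValue represents it. The location is *covered*; a concrete value can |
| // be materialized on demand via the SSA updater (see |
| // computePredecessorLocationValue), possibly as a SILArgument. |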
| |
| private: |
| /// # of locations in the LocationVault. |
| unsigned LocationNum; |
| |
| /// The basic block that we are optimizing. |
| SILBasicBlock *BB; |
| |
| /// A bit vector for which the ith bit represents the ith LSLocation in |
| /// LocationVault. If the bit is set, then the location currently has a |
| /// downward visible value at the beginning of the basic block. |
| llvm::SmallBitVector ForwardSetIn; |
| |
| /// A bit vector for which the ith bit represents the ith LSLocation in |
| /// LocationVault. If the bit is set, then the location currently has a |
| /// downward visible value at the end of the basic block. |
| llvm::SmallBitVector ForwardSetOut; |
| |
| /// A bit vector for which the ith bit represents the ith LSLocation in |
| /// LocationVault. If we ignore all unknown writes, what is the maximum set |
| /// of available locations at the current position in the basic block. |
| llvm::SmallBitVector ForwardSetMax; |
| |
| /// A bit vector for which the ith bit represents the ith LSLocation in |
| /// LocationVault. If the bit is set, then the basic block generates a |
| /// value for the location. |
| llvm::SmallBitVector BBGenSet; |
| |
| /// A bit vector for which the ith bit represents the ith LSLocation in |
| /// LocationVault. If the bit is set, then the basic block kills the |
| /// value for the location. |
| llvm::SmallBitVector BBKillSet; |
| |
| /// This is a map between LSLocations and their available values at the |
| /// beginning of this basic block. |
| ValueTableMap ForwardValIn; |
| |
| /// This is a map between LSLocations and their available values at the end of |
| /// this basic block. |
| ValueTableMap ForwardValOut; |
| |
| /// Keeps a list of replaceable instructions in the current basic block as |
| /// well as their SILValue replacement. |
| llvm::DenseMap<SILInstruction *, SILValue> RedundantLoads; |
| |
| /// The LSLocation that is read or written has been extracted, expanded and |
| /// mapped to a bit position in the bit vector. Update the ForwardSetIn of |
| /// the current basic block using that bit position. |
| void updateForwardSetForRead(RLEContext &Ctx, unsigned B); |
| void updateForwardSetForWrite(RLEContext &Ctx, unsigned B); |
| |
| /// The LSLocation that is read or written has been extracted, expanded and |
| /// mapped to bit position B in the bit vector. Update the genset and |
| /// killset of the current basic block accordingly. |
| void updateGenKillSetForRead(RLEContext &Ctx, unsigned B); |
| void updateGenKillSetForWrite(RLEContext &Ctx, unsigned B); |
| |
| /// The LSLocation that is read or written has been extracted, expanded and |
| /// mapped to bit position B in the bit vector. Update the ForwardSetMax of |
| /// the current basic block accordingly. |
| void updateMaxAvailSetForRead(RLEContext &Ctx, unsigned B); |
| void updateMaxAvailSetForWrite(RLEContext &Ctx, unsigned B); |
| |
| /// The LSLocation that is read or written has been extracted, expanded and |
| /// mapped to bit position L, with its value mapped to bit position V. |
| /// Update the ForwardSetIn and ForwardValIn of the current basic block. |
| void updateForwardSetAndValForRead(RLEContext &Ctx, unsigned L, unsigned V); |
| void updateForwardSetAndValForWrite(RLEContext &Ctx, unsigned L, unsigned V); |
| |
| /// There is a read to a LSLocation, expand the LSLocation into individual |
| /// fields before processing them. |
| void processRead(RLEContext &Ctx, SILInstruction *I, SILValue Mem, |
| SILValue Val, RLEKind Kind); |
| |
| /// There is a write to a LSLocation, expand the LSLocation into individual |
| /// fields before processing them. |
| void processWrite(RLEContext &Ctx, SILInstruction *I, SILValue Mem, |
| SILValue Val, RLEKind Kind); |
| |
| /// BitVector manipulation functions. |
| void startTrackingLocation(llvm::SmallBitVector &BV, unsigned B); |
| void stopTrackingLocation(llvm::SmallBitVector &BV, unsigned B); |
| bool isTrackingLocation(llvm::SmallBitVector &BV, unsigned B); |
| void startTrackingValue(ValueTableMap &VM, unsigned L, unsigned V); |
| void stopTrackingValue(ValueTableMap &VM, unsigned B); |
| |
| public: |
| BlockState() = default; |
| |
| void init(SILBasicBlock *NewBB, unsigned bitcnt, bool optimistic) { |
| BB = NewBB; |
| LocationNum = bitcnt; |
| // For reachable basic blocks, the initial state of ForwardSetOut should be |
| // all 1's. Otherwise the dataflow solution could be too conservative. |
| // |
| // Consider this case: the forwardable value created by "var a = 10" before |
| // the loop will not be forwarded if ForwardSetOut is initialized to 0. |
| // |
| // var a = 10 |
| // for _ in 0...1024 {} |
| // use(a); |
| // |
| // However, by doing so, we can only do the data forwarding after the |
| // data flow stabilizes. |
| // |
| // We set the initial state of unreachable blocks to 0, as we do not have |
| // a value for the location. |
| // |
| // This is a bit conservative, as we could be missing forwarding |
| // opportunities, e.g. at a join block where one predecessor is an |
| // unreachable block. |
| // |
| // We rely on other passes to clean up unreachable blocks. |
| ForwardSetIn.resize(LocationNum, false); |
| ForwardSetOut.resize(LocationNum, optimistic); |
| |
| // If we are running an optimistic data flow, set forward max to true |
| // initially. |
| ForwardSetMax.resize(LocationNum, optimistic); |
| |
| BBGenSet.resize(LocationNum, false); |
| BBKillSet.resize(LocationNum, false); |
| } |
| |
| /// Initialize the AvailSetMax by intersecting this basic block's |
| /// predecessors' AvailSetMax. |
| void mergePredecessorsAvailSetMax(RLEContext &Ctx); |
| |
| /// Initialize the AvailSet by intersecting this basic block's predecessors' |
| /// AvailSet. |
| void mergePredecessorAvailSet(RLEContext &Ctx); |
| |
| /// Initialize the AvailSet and AvailVal of the current basic block. |
| void mergePredecessorAvailSetAndValue(RLEContext &Ctx); |
| |
| /// Reached the end of the basic block, update the ForwardValOut with the |
| /// ForwardValIn. |
| void updateForwardValOut() { ForwardValOut = ForwardValIn; } |
| |
| /// Check whether the ForwardSetOut has changed. If it has, we need to |
| /// rerun the data flow to reach a fixed point. |
| bool updateForwardSetOut() { |
| if (ForwardSetIn == ForwardSetOut) |
| return false; |
| ForwardSetOut = ForwardSetIn; |
| return true; |
| } |
| |
| /// Returns the current basic block we are processing. |
| SILBasicBlock *getBB() const { return BB; } |
| |
| /// Returns the ForwardValIn for the current basic block. |
| ValueTableMap &getForwardValIn() { return ForwardValIn; } |
| |
| /// Returns the ForwardValOut for the current basic block. |
| ValueTableMap &getForwardValOut() { return ForwardValOut; } |
| |
| /// Returns the redundant loads and their replacements in the current basic |
| /// block. |
| llvm::DenseMap<SILInstruction *, SILValue> &getRL() { return RedundantLoads; } |
| |
| /// Look at the value for the given LSLocation at the end of the basic |
| /// block and return one of the three ValueState kinds. |
| ValueState getValueStateAtEndOfBlock(RLEContext &Ctx, LSLocation &L); |
| |
| /// Wrappers to query the value state of the location in this BlockState. |
| bool isCoverValues(RLEContext &Ctx, LSLocation &L) { |
| return getValueStateAtEndOfBlock(Ctx, L) == ValueState::CoverValues; |
| } |
| bool isConcreteValues(RLEContext &Ctx, LSLocation &L) { |
| return getValueStateAtEndOfBlock(Ctx, L) == ValueState::ConcreteValues; |
| } |
| |
| /// Iterate over the instructions in the basic block in forward order and |
| /// process them w.r.t. the given \p Kind. |
| void processInstructionWithKind(RLEContext &Ctx, SILInstruction *I, |
| RLEKind Kind); |
| void processBasicBlockWithKind(RLEContext &Ctx, RLEKind Kind); |
| |
| /// Process the current basic block with the genset and killset. Return true |
| /// if the ForwardSetOut changes. |
| bool processBasicBlockWithGenKillSet(); |
| |
| /// Set up the value for redundant load elimination. |
| bool setupRLE(RLEContext &Ctx, SILInstruction *I, SILValue Mem); |
| |
| /// Process an instruction which writes to memory in an unknown way. |
| void processUnknownWriteInst(RLEContext &Ctx, SILInstruction *I, |
| RLEKind Kind); |
| void processUnknownWriteInstForGenKillSet(RLEContext &Ctx, SILInstruction *I); |
| void processUnknownWriteInstForRLE(RLEContext &Ctx, SILInstruction *I); |
| |
| |
| void processDeallocStackInst(RLEContext &Ctx, SILInstruction *I, |
| RLEKind Kind); |
| void processDeallocStackInstForGenKillSet(RLEContext &Ctx, SILInstruction *I); |
| void processDeallocStackInstForRLE(RLEContext &Ctx, SILInstruction *I); |
| |
| /// Process LoadInst. Extract LSLocations from LoadInst. |
| void processLoadInst(RLEContext &Ctx, LoadInst *LI, RLEKind Kind); |
| |
| /// Process StoreInst. Extract LSLocations from StoreInst. |
| void processStoreInst(RLEContext &Ctx, StoreInst *SI, RLEKind Kind); |
| |
| /// Returns a *single* forwardable SILValue for the given LSLocation, |
| /// materialized right before the terminator of the basic block. |
| SILValue reduceValuesAtEndOfBlock(RLEContext &Ctx, LSLocation &L); |
| |
| #ifndef NDEBUG |
| void dump(RLEContext &Ctx); |
| #endif |
| }; |
| |
| } // end anonymous namespace |
| |
| //===----------------------------------------------------------------------===// |
| // RLEContext Interface |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
| |
| using BBValueMap = llvm::DenseMap<SILBasicBlock *, SILValue>; |
| |
| /// This class stores global state that we use when computing redundant |
| /// loads and their replacements in each basic block. |
| class RLEContext { |
| enum class ProcessKind { |
| ProcessMultipleIterations = 0, |
| ProcessOneIteration = 1, |
| ProcessNone = 2, |
| }; |
| private: |
| /// The function we are currently processing. |
| SILFunction *Fn; |
| |
| /// The passmanager we are using. |
| SILPassManager *PM; |
| |
| /// The alias analysis that we will use during all computations. |
| AliasAnalysis *AA; |
| |
| /// The type expansion analysis we will use during all computations. |
| TypeExpansionAnalysis *TE; |
| |
| /// The SSA updater we use to materialize covering values. |
| SILSSAUpdater Updater; |
| |
| /// The range that we use to iterate over the post order and reverse post |
| /// order of the given function. |
| PostOrderFunctionInfo *PO; |
| |
| /// Epilogue release analysis. |
| EpilogueARCFunctionInfo *EAFI; |
| |
| /// Keeps all the locations for the current function. The BitVector in each |
| /// BlockState is then laid on top of it to keep track of which LSLocation |
| /// has a downward available value. |
| std::vector<LSLocation> LocationVault; |
| |
| /// Contains a map from an LSLocation to its index in the LocationVault. |
| /// Used for fast lookup. |
| LSLocationIndexMap LocToBitIndex; |
| |
| /// Keeps a map between the accessed SILValue and the location. |
| LSLocationBaseMap BaseToLocIndex; |
| |
| /// Keeps all the LSValues for the current function. The ValueTableMap in |
| /// each BlockState refers into this vault by index to keep track of which |
| /// value is available for each LSLocation. |
| std::vector<LSValue> LSValueVault; |
| |
| /// Contains a map from an LSValue to its index in the LSValueVault. |
| /// Used for fast lookup. |
| llvm::DenseMap<LSValue, unsigned> ValToBitIndex; |
| |
| /// A map from each BasicBlock to its BlockState. |
| llvm::SmallDenseMap<SILBasicBlock *, BlockState, 16> BBToLocState; |
| |
| /// Keeps a list of basic blocks that have LoadInsts. If a basic block does |
| /// not have a LoadInst, we do not actually perform the last iteration where |
| /// RLE is performed on the basic block. |
| /// |
| /// NOTE: This is never populated for functions which only require 1 data |
| /// flow iteration. For functions that require more than 1 iteration of the |
| /// data flow, this is populated the first time the function is walked, i.e. |
| /// when we generate the genset and killset. |
| llvm::DenseSet<SILBasicBlock *> BBWithLoads; |
| |
| #ifndef NDEBUG |
| SILPrintContext printCtx; |
| #endif |
| |
| public: |
| RLEContext(SILFunction *F, SILPassManager *PM, AliasAnalysis *AA, |
| TypeExpansionAnalysis *TE, PostOrderFunctionInfo *PO, |
| EpilogueARCFunctionInfo *EAFI); |
| |
| RLEContext(const RLEContext &) = delete; |
| RLEContext(RLEContext &&) = default; |
| ~RLEContext() = default; |
| |
| /// Entry point to redundant load elimination. |
| bool run(); |
| |
| SILFunction *getFunction() const { return Fn; } |
| |
| /// Use a set of ad hoc rules to decide whether to skip the function, run a |
| /// pessimistic one-iteration data flow, or run the full iterative data flow |
| /// on the function. |
| ProcessKind getProcessFunctionKind(unsigned LoadCount, unsigned StoreCount); |
| |
| /// Run the iterative data flow until convergence. |
| void runIterativeRLE(); |
| |
| /// Process the basic blocks for the gen and kill set. |
| void processBasicBlocksForGenKillSet(); |
| |
| /// Process the basic blocks with the gen and kill set. |
| void processBasicBlocksWithGenKillSet(); |
| |
| /// Process the basic blocks to compute the available values generated |
| /// within each basic block. |
| void processBasicBlocksForAvailValue(); |
| |
| /// Process basic blocks to perform the redundant load elimination. |
| void processBasicBlocksForRLE(bool Optimistic); |
| |
| /// Returns the alias analysis we will use during all computations. |
| AliasAnalysis *getAA() const { return AA; } |
| |
| /// Returns the type expansion analysis we are using. |
| TypeExpansionAnalysis *getTE() const { return TE; } |
| |
| /// Returns the epilogue ARC function info we are using. |
| EpilogueARCFunctionInfo *getEAFI() const { return EAFI; } |
| |
| /// Returns the map from a base SILValue to its LSLocation. |
| LSLocationBaseMap &getBM() { return BaseToLocIndex; } |
| |
| /// Return the BlockState for the given basic block. |
| BlockState &getBlockState(SILBasicBlock *B) { return BBToLocState[B]; } |
| |
| /// Get the bit representing the LSLocation in the LocationVault. |
| unsigned getLocationBit(const LSLocation &L); |
| |
| /// Given the bit, get the LSLocation from the LocationVault. |
| LSLocation &getLocation(const unsigned index); |
| |
| /// Get the bit representing the LSValue in the LSValueVault. |
| unsigned getValueBit(const LSValue &L); |
| |
| /// Given the bit, get the LSValue from the LSValueVault. |
| LSValue &getValue(const unsigned index); |
| |
| /// Given an LSLocation, try to collect all the LSValues for this LSLocation |
| /// in the given basic block. If some of the expanded locations only have |
| /// covering values, find the real values in the block's predecessors. |
| bool collectLocationValues(SILBasicBlock *BB, LSLocation &L, |
| LSLocationValueMap &Values, ValueTableMap &VM); |
| |
| /// Transitively collect all the values that make up this location and |
| /// create a single SILValue out of them (possibly by introducing |
| /// SILArguments via the SSA updater). |
| SILValue computePredecessorLocationValue(SILBasicBlock *BB, LSLocation &L); |
| }; |
| |
| } // end anonymous namespace |
| |
| void BlockState::startTrackingValue(ValueTableMap &VM, unsigned L, unsigned V) { |
| VM[L] = V; |
| } |
| |
| void BlockState::stopTrackingValue(ValueTableMap &VM, unsigned B) { |
| VM.erase(B); |
| } |
| |
| bool BlockState::isTrackingLocation(llvm::SmallBitVector &BV, unsigned B) { |
| return BV.test(B); |
| } |
| |
| void BlockState::startTrackingLocation(llvm::SmallBitVector &BV, unsigned B) { |
| BV.set(B); |
| } |
| |
| void BlockState::stopTrackingLocation(llvm::SmallBitVector &BV, unsigned B) { |
| BV.reset(B); |
| } |
| |
| void BlockState::mergePredecessorsAvailSetMax(RLEContext &Ctx) { |
| if (BB->pred_empty()) { |
| ForwardSetMax.reset(); |
| return; |
| } |
| |
| auto Iter = BB->pred_begin(); |
| ForwardSetMax = Ctx.getBlockState(*Iter).ForwardSetMax; |
| Iter = std::next(Iter); |
| for (auto EndIter = BB->pred_end(); Iter != EndIter; ++Iter) { |
| ForwardSetMax &= Ctx.getBlockState(*Iter).ForwardSetMax; |
| } |
| } |
| |
| void BlockState::mergePredecessorAvailSet(RLEContext &Ctx) { |
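| // Intersection-based merge: |
| //   ForwardSetIn(BB) = AND of ForwardSetOut(P) over all predecessors P |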
| // Clear the state if the basic block has no predecessor. |
| if (BB->getPredecessorBlocks().begin() == BB->getPredecessorBlocks().end()) { |
| ForwardSetIn.reset(); |
| return; |
| } |
| |
| auto Iter = BB->pred_begin(); |
| ForwardSetIn = Ctx.getBlockState(*Iter).ForwardSetOut; |
| Iter = std::next(Iter); |
| for (auto EndIter = BB->pred_end(); Iter != EndIter; ++Iter) { |
| ForwardSetIn &= Ctx.getBlockState(*Iter).ForwardSetOut; |
| } |
| } |
| |
| void BlockState::mergePredecessorAvailSetAndValue(RLEContext &Ctx) { |
| // Clear the state if the basic block has no predecessor. |
| if (BB->getPredecessorBlocks().begin() == BB->getPredecessorBlocks().end()) { |
| ForwardSetIn.reset(); |
| ForwardValIn.clear(); |
| return; |
| } |
| |
| auto Iter = BB->pred_begin(); |
| ForwardSetIn = Ctx.getBlockState(*Iter).ForwardSetOut; |
| ForwardValIn = Ctx.getBlockState(*Iter).ForwardValOut; |
| Iter = std::next(Iter); |
| for (auto EndIter = BB->pred_end(); Iter != EndIter; ++Iter) { |
| BlockState &OtherState = Ctx.getBlockState(*Iter); |
| ForwardSetIn &= OtherState.ForwardSetOut; |
| |
| // Merge in the predecessor state. |
| for (unsigned i = 0; i < LocationNum; ++i) { |
| if (OtherState.ForwardSetOut[i]) { |
| // There are multiple values from multiple predecessors, set this as |
| // a covering value. We do not need to track the value itself, as we |
| // can always go to the predecessors BlockState to find it. |
| ForwardValIn[i] = Ctx.getValueBit(LSValue(true)); |
| continue; |
| } |
| // If this location does not have an available value in this predecessor, |
| // then clear it. |
| stopTrackingValue(ForwardValIn, i); |
| stopTrackingLocation(ForwardSetIn, i); |
| } |
| } |
| } |
| |
| void BlockState::processBasicBlockWithKind(RLEContext &Ctx, RLEKind Kind) { |
| // Iterate over instructions in forward order. |
| for (auto &II : *BB) { |
| processInstructionWithKind(Ctx, &II, Kind); |
| } |
| } |
| |
| bool BlockState::processBasicBlockWithGenKillSet() { |
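| // Standard forward transfer function over the bit vectors: |
| //   ForwardSetOut = (ForwardSetIn & ~BBKillSet) | BBGenSet |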
| ForwardSetIn.reset(BBKillSet); |
| ForwardSetIn |= BBGenSet; |
| return updateForwardSetOut(); |
| } |
| |
| SILValue BlockState::reduceValuesAtEndOfBlock(RLEContext &Ctx, LSLocation &L) { |
| // First, collect current available locations and their corresponding values |
| // into a map. |
| LSLocationValueMap Values; |
| |
| LSLocationList Locs; |
| LSLocation::expand(L, &BB->getModule(), Locs, Ctx.getTE()); |
| |
| // Find the values that this basic block defines and the locations for |
| // which we do not have a concrete value in the current basic block. |
| ValueTableMap &OTM = getForwardValOut(); |
| for (unsigned i = 0; i < Locs.size(); ++i) { |
| Values[Locs[i]] = Ctx.getValue(OTM[Ctx.getLocationBit(Locs[i])]); |
| } |
| |
| // Second, reduce the available values into a single SILValue we can use to |
| // forward. |
| SILValue TheForwardingValue; |
| TheForwardingValue = LSValue::reduce(L, &BB->getModule(), Values, |
| BB->getTerminator()); |
| // Return the forwarding value. |
| return TheForwardingValue; |
| } |
| |
| bool BlockState::setupRLE(RLEContext &Ctx, SILInstruction *I, SILValue Mem) { |
| // Try to construct a SILValue for the current LSLocation. |
| // |
| // Collect the locations and their corresponding values into a map. |
| LSLocation L; |
| LSLocationBaseMap &BaseToLocIndex = Ctx.getBM(); |
| if (BaseToLocIndex.find(Mem) != BaseToLocIndex.end()) { |
| L = BaseToLocIndex[Mem]; |
| } else { |
| SILValue UO = getUnderlyingObject(Mem); |
| L = LSLocation(UO, ProjectionPath::getProjectionPath(UO, Mem)); |
| } |
| |
| LSLocationValueMap Values; |
| // Use the ForwardValIn as we are currently processing the basic block. |
| if (!Ctx.collectLocationValues(I->getParent(), L, Values, getForwardValIn())) |
| return false; |
| |
| // Reduce the available values into a single SILValue we can use to forward. |
| SILModule *Mod = &I->getModule(); |
| SILValue TheForwardingValue = LSValue::reduce(L, Mod, Values, I); |
| if (!TheForwardingValue) |
| return false; |
| |
| // Now that we have the forwarding value, record it for forwarding! |
| // |
| // NOTE: we do not perform the RLE right here because doing so could introduce |
| // new LSLocations. |
| // |
| // e.g. |
| // %0 = load %x |
| // %1 = load %x |
| // %2 = struct_extract %1, #a |
| // %3 = load %2 |
| // |
| // If we perform the RLE and replace %1 with %0, we end up with a memory |
| // location we did not have before, i.e. Base == %0, and Path == #a. |
| // |
| // We may be able to add the LSLocation to the vault, but it gets |
| // complicated very quickly, e.g. we need to resize the bit vectors, |
| // etc. |
| // |
| // However, since we already know the instruction to replace and the value to |
| // replace it with, we can record it now and forward it after all the |
| // forwardable values in the function have been recorded. |
| // |
| RedundantLoads[I] = TheForwardingValue; |
| |
| DEBUG(llvm::dbgs() << "FORWARD " << TheForwardingValue << " to" << *I); |
| return true; |
| } |
| |
| void BlockState::updateForwardSetForRead(RLEContext &Ctx, unsigned B) { |
| startTrackingLocation(ForwardSetIn, B); |
| } |
| |
| void BlockState::updateGenKillSetForRead(RLEContext &Ctx, unsigned B) { |
| startTrackingLocation(BBGenSet, B); |
| stopTrackingLocation(BBKillSet, B); |
| } |
| |
| void BlockState::updateForwardSetAndValForRead(RLEContext &Ctx, unsigned L, |
| unsigned V) { |
| // Track the new location and value. |
| startTrackingValue(ForwardValIn, L, V); |
| startTrackingLocation(ForwardSetIn, L); |
| } |
| |
| void BlockState::updateGenKillSetForWrite(RLEContext &Ctx, unsigned B) { |
| // This is a store, invalidate any location that this location may alias, as |
| // their values can no longer be forwarded. |
| LSLocation &R = Ctx.getLocation(B); |
| for (unsigned i = 0; i < LocationNum; ++i) { |
| if (!isTrackingLocation(ForwardSetMax, i)) |
| continue; |
| LSLocation &L = Ctx.getLocation(i); |
| if (!L.isMayAliasLSLocation(R, Ctx.getAA())) |
| continue; |
| // MayAlias, invalidate the location. |
| stopTrackingLocation(BBGenSet, i); |
| startTrackingLocation(BBKillSet, i); |
| } |
| |
| // Start tracking this location. |
| startTrackingLocation(BBGenSet, B); |
| stopTrackingLocation(BBKillSet, B); |
| } |
| |
| void BlockState::updateMaxAvailSetForWrite(RLEContext &Ctx, unsigned B) { |
| startTrackingLocation(ForwardSetMax, B); |
| } |
| |
| void BlockState::updateMaxAvailSetForRead(RLEContext &Ctx, unsigned B) { |
| startTrackingLocation(ForwardSetMax, B); |
| } |
| |
| void BlockState::updateForwardSetForWrite(RLEContext &Ctx, unsigned B) { |
| // This is a store, invalidate any location that this location may alias, as |
| // their values can no longer be forwarded. |
| LSLocation &R = Ctx.getLocation(B); |
| for (unsigned i = 0; i < LocationNum; ++i) { |
| if (!isTrackingLocation(ForwardSetIn, i)) |
| continue; |
| LSLocation &L = Ctx.getLocation(i); |
| if (!L.isMayAliasLSLocation(R, Ctx.getAA())) |
| continue; |
| // MayAlias, invalidate the location. |
| stopTrackingLocation(ForwardSetIn, i); |
| } |
| |
| // Start tracking this location. |
| startTrackingLocation(ForwardSetIn, B); |
| } |
| |
| void BlockState::updateForwardSetAndValForWrite(RLEContext &Ctx, unsigned L, |
| unsigned V) { |
| // This is a store, invalidate any location that this location may alias, as |
| // their values can no longer be forwarded. |
| LSLocation &R = Ctx.getLocation(L); |
| for (unsigned i = 0; i < LocationNum; ++i) { |
| if (!isTrackingLocation(ForwardSetIn, i)) |
| continue; |
| LSLocation &Loc = Ctx.getLocation(i); |
| if (!Loc.isMayAliasLSLocation(R, Ctx.getAA())) |
| continue; |
| // MayAlias, invalidate the location and value. |
| stopTrackingValue(ForwardValIn, i); |
| stopTrackingLocation(ForwardSetIn, i); |
| } |
| |
| // Start tracking this location and value. |
| startTrackingLocation(ForwardSetIn, L); |
| startTrackingValue(ForwardValIn, L, V); |
| } |
| |
| void BlockState::processWrite(RLEContext &Ctx, SILInstruction *I, SILValue Mem, |
| SILValue Val, RLEKind Kind) { |
| // Initialize the LSLocation. |
| LSLocation L; |
| LSLocationBaseMap &BaseToLocIndex = Ctx.getBM(); |
| if (BaseToLocIndex.find(Mem) != BaseToLocIndex.end()) { |
| L = BaseToLocIndex[Mem]; |
| } else { |
| SILValue UO = getUnderlyingObject(Mem); |
| L = LSLocation(UO, ProjectionPath::getProjectionPath(UO, Mem)); |
| } |
| |
| // If we can't figure out the Base or Projection Path for the write, |
| // process it as an unknown memory instruction. |
| if (!L.isValid()) { |
| // We can ignore unknown store instructions if we are computing the |
| // AvailSetMax. |
| if (!isComputeAvailSetMax(Kind)) { |
| processUnknownWriteInst(Ctx, I, Kind); |
| } |
| return; |
| } |
| |
| // Expand the given location and val into individual fields and process |
| // them as separate writes. |
| LSLocationList Locs; |
| LSLocation::expand(L, &I->getModule(), Locs, Ctx.getTE()); |
| |
| if (isComputeAvailSetMax(Kind)) { |
| for (unsigned i = 0; i < Locs.size(); ++i) { |
| updateMaxAvailSetForWrite(Ctx, Ctx.getLocationBit(Locs[i])); |
| } |
| return; |
| } |
| |
| // Are we computing the genset and killset? |
| if (isComputeAvailGenKillSet(Kind)) { |
| for (unsigned i = 0; i < Locs.size(); ++i) { |
| updateGenKillSetForWrite(Ctx, Ctx.getLocationBit(Locs[i])); |
| } |
| return; |
| } |
| |
| // Are we computing available values or performing RLE? |
| LSValueList Vals; |
| LSValue::expand(Val, &I->getModule(), Vals, Ctx.getTE()); |
| if (isComputeAvailValue(Kind) || isPerformingRLE(Kind)) { |
| for (unsigned i = 0; i < Locs.size(); ++i) { |
| updateForwardSetAndValForWrite(Ctx, Ctx.getLocationBit(Locs[i]), |
| Ctx.getValueBit(Vals[i])); |
| } |
| return; |
| } |
| |
| llvm_unreachable("Unknown RLE compute kind"); |
| } |
| |
| void BlockState::processRead(RLEContext &Ctx, SILInstruction *I, SILValue Mem, |
| SILValue Val, RLEKind Kind) { |
| // Initialize the LSLocation. |
| LSLocation L; |
| LSLocationBaseMap &BaseToLocIndex = Ctx.getBM(); |
| if (BaseToLocIndex.find(Mem) != BaseToLocIndex.end()) { |
| L = BaseToLocIndex[Mem]; |
| } else { |
| SILValue UO = getUnderlyingObject(Mem); |
| L = LSLocation(UO, ProjectionPath::getProjectionPath(UO, Mem)); |
| } |
| |
| // If we can't figure out the Base or Projection Path for the read, simply |
| // ignore it for now. |
| if (!L.isValid()) |
| return; |
| |
| // Expand the given LSLocation and Val into individual fields and process |
| // them as separate reads. |
| LSLocationList Locs; |
| LSLocation::expand(L, &I->getModule(), Locs, Ctx.getTE()); |
| |
| if (isComputeAvailSetMax(Kind)) { |
| for (unsigned i = 0; i < Locs.size(); ++i) { |
| updateMaxAvailSetForRead(Ctx, Ctx.getLocationBit(Locs[i])); |
| } |
| return; |
| } |
| |
| // Are we computing the genset and killset? |
| if (isComputeAvailGenKillSet(Kind)) { |
| for (unsigned i = 0; i < Locs.size(); ++i) { |
| updateGenKillSetForRead(Ctx, Ctx.getLocationBit(Locs[i])); |
| } |
| return; |
| } |
| |
| // Are we computing available values or performing RLE? |
| bool CanForward = true; |
| LSValueList Vals; |
| LSValue::expand(Val, &I->getModule(), Vals, Ctx.getTE()); |
| if (isComputeAvailValue(Kind) || isPerformingRLE(Kind)) { |
| for (unsigned i = 0; i < Locs.size(); ++i) { |
| if (isTrackingLocation(ForwardSetIn, Ctx.getLocationBit(Locs[i]))) |
| continue; |
| updateForwardSetAndValForRead(Ctx, Ctx.getLocationBit(Locs[i]), |
| Ctx.getValueBit(Vals[i])); |
| // We cannot perform the forwarding, as we are missing at least some |
| // pieces of the read location. |
| CanForward = false; |
| } |
| } |
| |
| // Simply return if we are not performing RLE or we do not have all the |
| // values available to perform RLE. |
| if (!isPerformingRLE(Kind) || !CanForward) |
| return; |
| |
| // Lastly, forward value to the load. |
| setupRLE(Ctx, I, Mem); |
| } |
| |
| void BlockState::processStoreInst(RLEContext &Ctx, StoreInst *SI, RLEKind Kind) { |
| processWrite(Ctx, SI, SI->getDest(), SI->getSrc(), Kind); |
| } |
| |
| void BlockState::processLoadInst(RLEContext &Ctx, LoadInst *LI, RLEKind Kind) { |
| processRead(Ctx, LI, LI->getOperand(), SILValue(LI), Kind); |
| } |
| |
| void BlockState::processUnknownWriteInstForGenKillSet(RLEContext &Ctx, |
| SILInstruction *I) { |
| auto *AA = Ctx.getAA(); |
| for (unsigned i = 0; i < LocationNum; ++i) { |
| if (!isTrackingLocation(ForwardSetMax, i)) |
| continue; |
| // Invalidate any location this instruction may write to. |
| // |
| // TODO: checking may alias with Base is overly conservative, |
| // we should check may alias with base plus projection path. |
| LSLocation &R = Ctx.getLocation(i); |
| if (!AA->mayWriteToMemory(I, R.getBase())) |
| continue; |
| // MayAlias. |
| stopTrackingLocation(BBGenSet, i); |
| startTrackingLocation(BBKillSet, i); |
| } |
| } |
| |
| void BlockState::processUnknownWriteInstForRLE(RLEContext &Ctx, |
| SILInstruction *I) { |
| auto *AA = Ctx.getAA(); |
| for (unsigned i = 0; i < LocationNum; ++i) { |
| if (!isTrackingLocation(ForwardSetIn, i)) |
| continue; |
| // Invalidate any location this instruction may write to. |
| // |
| // TODO: checking may alias with Base is overly conservative, |
| // we should check may alias with base plus projection path. |
| LSLocation &R = Ctx.getLocation(i); |
| if (!AA->mayWriteToMemory(I, R.getBase())) |
| continue; |
| // MayAlias. |
| stopTrackingLocation(ForwardSetIn, i); |
| stopTrackingValue(ForwardValIn, i); |
| } |
| } |
| |
| void BlockState::processUnknownWriteInst(RLEContext &Ctx, SILInstruction *I, |
| RLEKind Kind) { |
| // If this is a release on a guaranteed parameter, it cannot call a deinit, |
| // which might read or write memory. |
| if (isIntermediateRelease(I, Ctx.getEAFI())) |
| return; |
| |
| // Are we computing the genset and killset? |
| if (isComputeAvailGenKillSet(Kind)) { |
| processUnknownWriteInstForGenKillSet(Ctx, I); |
| return; |
| } |
| |
| // Are we computing the available value or doing RLE? |
| if (isComputeAvailValue(Kind) || isPerformingRLE(Kind)) { |
| processUnknownWriteInstForRLE(Ctx, I); |
| return; |
| } |
| |
| llvm_unreachable("Unknown RLE compute kind"); |
| } |
| |
| void BlockState:: |
| processDeallocStackInstForGenKillSet(RLEContext &Ctx, SILInstruction *I) { |
| SILValue ASI = findAllocStackInst(I); |
| for (unsigned i = 0; i < LocationNum; ++i) { |
| LSLocation &R = Ctx.getLocation(i); |
| if (R.getBase() != ASI) |
| continue; |
| // MayAlias. |
| stopTrackingLocation(BBGenSet, i); |
| startTrackingLocation(BBKillSet, i); |
| } |
| } |
| |
| void BlockState:: |
| processDeallocStackInstForRLE(RLEContext &Ctx, SILInstruction *I) { |
| SILValue ASI = findAllocStackInst(I); |
| for (unsigned i = 0; i < LocationNum; ++i) { |
| LSLocation &R = Ctx.getLocation(i); |
| if (R.getBase() != ASI) |
| continue; |
| // MayAlias. |
| stopTrackingLocation(ForwardSetIn, i); |
| stopTrackingValue(ForwardValIn, i); |
| } |
| } |
| |
| void BlockState:: |
| processDeallocStackInst(RLEContext &Ctx, SILInstruction *I, RLEKind Kind) { |
| // Are we computing the genset and killset? |
| if (isComputeAvailGenKillSet(Kind)) { |
| processDeallocStackInstForGenKillSet(Ctx, I); |
| return; |
| } |
| |
| // Are we computing the available value or doing RLE? |
| if (isComputeAvailValue(Kind) || isPerformingRLE(Kind)) { |
| processDeallocStackInstForRLE(Ctx, I); |
| return; |
| } |
| |
| llvm_unreachable("Unknown RLE compute kind"); |
| } |
| |
| |
| void BlockState::processInstructionWithKind(RLEContext &Ctx, |
| SILInstruction *Inst, |
| RLEKind Kind) { |
| // If this is a StoreInst, see whether it clobbers any forwarding values. |
| if (auto *SI = dyn_cast<StoreInst>(Inst)) { |
| processStoreInst(Ctx, SI, Kind); |
| return; |
| } |
| |
| // This is a LoadInst. Let's see if we can find a previously loaded or |
| // stored value to use instead of this load. |
| if (auto *LI = dyn_cast<LoadInst>(Inst)) { |
| processLoadInst(Ctx, LI, Kind); |
| return; |
| } |
| |
| if (auto *DSI = dyn_cast<DeallocStackInst>(Inst)) { |
| processDeallocStackInst(Ctx, DSI, Kind); |
| return; |
| } |
| |
| // If this instruction has side effects, but is inert from a load store |
| // perspective, skip it. |
| if (isRLEInertInstruction(Inst)) |
| return; |
| |
| // If this instruction does not read or write memory, we can skip it. |
| if (!Inst->mayReadOrWriteMemory()) |
| return; |
| |
| // If we have an instruction that may write to memory and we cannot prove |
| // that it and its operands cannot alias a load we have visited, |
| // invalidate that load. |
| if (Inst->mayWriteToMemory()) { |
| DEBUG(llvm::dbgs() << "WRITE " << *Inst); |
| processUnknownWriteInst(Ctx, Inst, Kind); |
| return; |
| } |
| DEBUG(llvm::dbgs() << "READ " << *Inst); |
| } |
| |
| RLEContext::ProcessKind |
| RLEContext:: |
| getProcessFunctionKind(unsigned LoadCount, unsigned StoreCount) { |
| // Don't optimize functions that are marked as 'no.optimize'. |
| if (!Fn->shouldOptimize()) |
| return ProcessKind::ProcessNone; |
| |
| // Really no point optimizing here, as there are no forwardable loads. |
| if (LoadCount + StoreCount < 2) |
| return ProcessKind::ProcessNone; |
| |
| bool RunOneIteration = true; |
| unsigned BBCount = 0; |
| unsigned LocationCount = LocationVault.size(); |
| |
| if (LocationCount == 0) |
| return ProcessKind::ProcessNone; |
| |
| // If every basic block has all of its predecessors processed before itself |
| // when the basic blocks in the function are iterated in reverse post order, |
| // then this function can be processed in one iteration, i.e. there is no |
| // need to generate the genset and killset. |
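| // For example, if the CFG is a DAG (no back edges), a reverse post order |
| // visits every block after all of its predecessors, so a single pass |
| // suffices. |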
| auto *PO = PM->getAnalysis<PostOrderAnalysis>()->get(Fn); |
| llvm::DenseSet<SILBasicBlock *> HandledBBs; |
| for (SILBasicBlock *B : PO->getReversePostOrder()) { |
| ++BBCount; |
| for (auto X : B->getPredecessorBlocks()) { |
| if (HandledBBs.find(X) == HandledBBs.end()) { |
| RunOneIteration = false; |
| break; |
| } |
| } |
| HandledBBs.insert(B); |
| } |
| |
| // Data flow may take too long to run. |
| if (BBCount * LocationCount > MaxLSLocationBBMultiplicationNone) |
| return ProcessKind::ProcessNone; |
| |
| // This function's data flow would converge in 1 iteration. |
| if (RunOneIteration) |
| return ProcessKind::ProcessOneIteration; |
| |
| // The function is too large for the optimistic data flow; run a single |
| // pessimistic data flow iteration instead. |
| if (BBCount * LocationCount > MaxLSLocationBBMultiplicationPessimistic) |
| return ProcessKind::ProcessOneIteration; |
| |
| return ProcessKind::ProcessMultipleIterations; |
| } |
| |
| #ifndef NDEBUG |
| void BlockState::dump(RLEContext &Ctx) { |
| for (unsigned i = 0; i < LocationNum; ++i) { |
| if (!isTrackingLocation(ForwardSetMax, i)) |
| continue; |
| |
| llvm::dbgs() << "Loc #" << i << ":" << (BBGenSet[i] ? " Gen" : "") |
| << (BBKillSet[i] ? " Kill" : ""); |
| if (!ForwardSetIn.empty() && ForwardSetIn.test(i)) { |
| llvm::dbgs() << " IN "; |
| ValueTableMap::const_iterator inIter = ForwardValIn.find(i); |
| if (inIter != ForwardValIn.end()) { |
| if (SILValue base = Ctx.getValue(inIter->second).getBase()) |
| llvm::dbgs() << base; |
| else |
| llvm::dbgs() << "no base"; |
| } |
| } |
| if (!ForwardSetOut.empty() && ForwardSetOut.test(i)) { |
| llvm::dbgs() << " OUT "; |
| ValueTableMap::const_iterator outIter = ForwardValOut.find(i); |
| if (outIter != ForwardValOut.end()) { |
| if (SILValue base = Ctx.getValue(outIter->second).getBase()) |
| llvm::dbgs() << base; |
| else |
| llvm::dbgs() << "no base"; |
| } |
| } |
| llvm::dbgs() << "\n"; |
| } |
| } |
| #endif |
| |
| //===----------------------------------------------------------------------===// |
| // RLEContext Implementation |
| //===----------------------------------------------------------------------===// |
| |
| RLEContext::RLEContext(SILFunction *F, SILPassManager *PM, AliasAnalysis *AA, |
| TypeExpansionAnalysis *TE, PostOrderFunctionInfo *PO, |
| EpilogueARCFunctionInfo *EAFI) |
| : Fn(F), PM(PM), AA(AA), TE(TE), PO(PO), EAFI(EAFI) |
| #ifndef NDEBUG |
| , |
| printCtx(llvm::dbgs(), /*Verbose=*/false, /*Sorted=*/true) |
| #endif |
| { |
| } |
| |
| LSLocation &RLEContext::getLocation(const unsigned index) { |
| return LocationVault[index]; |
| } |
| |
| unsigned RLEContext::getLocationBit(const LSLocation &Loc) { |
| // Return the bit position of the given Loc in the LocationVault. The bit |
| // position is then used to set/reset the bitvector kept by each BlockState. |
| // |
| // We should have the location populated by enumerateLSLocations at this |
| // point. |
| auto Iter = LocToBitIndex.find(Loc); |
| assert(Iter != LocToBitIndex.end() && "Location should have been enum'ed"); |
| return Iter->second; |
| } |
| |
| LSValue &RLEContext::getValue(const unsigned index) { |
| return LSValueVault[index]; |
| } |
| |
| unsigned RLEContext::getValueBit(const LSValue &Val) { |
| // Return the bit position of the given Val in the LSValueVault. The bit |
| // position is then used to set/reset the bitvector kept by each BlockState. |
| auto Iter = ValToBitIndex.find(Val); |
| |
| // We do not walk over the function to find all the possible LSValues up |
| // front, as some of these values will not be used, e.g. when the LoadInst |
| // that generates a value is itself RLE'ed. |
| // Instead, we create the LSValues when we need them. |
| if (Iter == ValToBitIndex.end()) { |
| ValToBitIndex[Val] = LSValueVault.size(); |
| LSValueVault.push_back(Val); |
| return ValToBitIndex[Val]; |
| } |
| return Iter->second; |
| } |
| |
| BlockState::ValueState BlockState::getValueStateAtEndOfBlock(RLEContext &Ctx, |
| LSLocation &L) { |
| // Count the covering values and concrete values for the locations expanded |
| // from the given location. |
| unsigned CSCount = 0, CTCount = 0; |
| LSLocationList Locs; |
| LSLocation::expand(L, &BB->getModule(), Locs, Ctx.getTE()); |
| |
| ValueTableMap &OTM = getForwardValOut(); |
| for (auto &X : Locs) { |
| LSValue &V = Ctx.getValue(OTM[Ctx.getLocationBit(X)]); |
| if (V.isCoveringValue()) { |
| ++CSCount; |
| continue; |
| } |
| ++CTCount; |
| } |
| |
| if (CSCount == Locs.size()) |
| return ValueState::CoverValues; |
| if (CTCount == Locs.size()) |
| return ValueState::ConcreteValues; |
| return ValueState::CoverAndConcreteValues; |
| } |
| |
| SILValue RLEContext::computePredecessorLocationValue(SILBasicBlock *BB, |
| LSLocation &L) { |
| BBValueMap Values; |
| llvm::DenseSet<SILBasicBlock *> HandledBBs; |
| llvm::SmallVector<SILBasicBlock *, 8> WorkList; |
| |
| // Push in all the predecessors to get started. |
| for (auto Pred : BB->getPredecessorBlocks()) { |
| WorkList.push_back(Pred); |
| } |
| |
| while (!WorkList.empty()) { |
| auto *CurBB = WorkList.pop_back_val(); |
| BlockState &Forwarder = getBlockState(CurBB); |
| |
| // Mark this basic block as processed. |
| HandledBBs.insert(CurBB); |
| |
| // There are 3 cases that can happen here. |
| // |
| // 1. The current basic block contains concrete values for the entire |
| // location. |
| // 2. The current basic block contains covering values for the entire |
| // location. |
| // 3. The current basic block contains concrete value for part of the |
| // location and covering values for the rest. |
| // |
| // This BlockState contains concrete values for all the expanded |
| // locations, collect and reduce them into a single value in the current |
| // basic block. |
| if (Forwarder.isConcreteValues(*this, L)) { |
| Values[CurBB] = Forwarder.reduceValuesAtEndOfBlock(*this, L); |
| continue; |
| } |
| |
| // This BlockState does not contain a concrete value for any of the |
| // expanded locations; collect the values from this block's predecessors. |
| if (Forwarder.isCoverValues(*this, L)) { |
| for (auto Pred : CurBB->getPredecessorBlocks()) { |
| if (HandledBBs.find(Pred) != HandledBBs.end()) |
| continue; |
| WorkList.push_back(Pred); |
| } |
| continue; |
| } |
| |
| // This block contains concrete values for some but not all the expanded |
| // locations, recursively call collectLocationValues to materialize the |
| // value that reaches this basic block. |
| LSLocationValueMap LSValues; |
| if (!collectLocationValues(CurBB, L, LSValues, Forwarder.getForwardValOut())) |
| return SILValue(); |
| |
| // Reduce the available values into a single SILValue we can use to forward |
| SILInstruction *IPt = CurBB->getTerminator(); |
| Values[CurBB] = LSValue::reduce(L, &BB->getModule(), LSValues, IPt); |
| } |
| |
| // Finally, hand all the collected values to the SSA updater and |
| // materialize the value reaching BB (possibly creating SILArguments). |
| Updater.Initialize(L.getType(&BB->getModule()).getObjectType()); |
| for (auto V : Values) { |
| Updater.AddAvailableValue(V.first, V.second); |
| } |
| |
| return Updater.GetValueInMiddleOfBlock(BB); |
| } |
| |
| bool RLEContext::collectLocationValues(SILBasicBlock *BB, LSLocation &L, |
| LSLocationValueMap &Values, |
| ValueTableMap &VM) { |
| LSLocationSet CSLocs; |
| LSLocationList Locs; |
| LSLocation::expand(L, &BB->getModule(), Locs, TE); |
| |
| auto *Mod = &BB->getModule(); |
| // Find the locations that this basic block defines and the locations for |
| // which we do not have a concrete value in the current basic block. |
| for (auto &X : Locs) { |
| Values[X] = getValue(VM[getLocationBit(X)]); |
| if (!Values[X].isCoveringValue()) |
| continue; |
| CSLocs.insert(X); |
| } |
| |
| // For locations for which we do not have concrete values in this basic |
| // block, try to reduce them to the minimum number of locations possible; |
| // this will help us generate as few SILArguments as possible. |
| LSLocation::reduce(L, Mod, CSLocs); |
| |
| // To handle covering values, we need to go to the predecessors and |
| // materialize the values there. |
| for (auto &X : CSLocs) { |
| SILValue V = computePredecessorLocationValue(BB, X); |
| if (!V) |
| return false; |
| |
| // We've constructed a concrete value for the covering value. Expand and |
| // collect the newly created forwardable values. |
| LSLocationList Locs; |
| LSValueList Vals; |
| LSLocation::expand(X, Mod, Locs, TE); |
| LSValue::expand(V, Mod, Vals, TE); |
| |
| for (unsigned i = 0; i < Locs.size(); ++i) { |
| Values[Locs[i]] = Vals[i]; |
| assert(Values[Locs[i]].isValid() && "Invalid load store value"); |
| } |
| } |
| return true; |
| } |
| |
| void RLEContext::processBasicBlocksForGenKillSet() { |
| for (SILBasicBlock *BB : PO->getReversePostOrder()) { |
| DEBUG(llvm::dbgs() << "PROCESS " << printCtx.getID(BB) |
| << " for Gen/Kill:\n"; |
| BB->print(llvm::dbgs(), printCtx)); |
| |
| BlockState &S = getBlockState(BB); |
| |
| // Compute the AvailSetMax at the beginning of the basic block. |
| S.mergePredecessorsAvailSetMax(*this); |
| |
| // Compute the genset and killset. |
| // |
| // To optimize this process, we also compute the AvailSetMax at each |
| // point in the basic block. |
| for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { |
| if (auto *LI = dyn_cast<LoadInst>(&*I)) { |
| if (BBWithLoads.find(BB) == BBWithLoads.end()) |
| BBWithLoads.insert(BB); |
| S.processLoadInst(*this, LI, RLEKind::ComputeAvailSetMax); |
| } |
| if (auto *SI = dyn_cast<StoreInst>(&*I)) { |
| S.processStoreInst(*this, SI, RLEKind::ComputeAvailSetMax); |
| } |
| |
| S.processInstructionWithKind(*this, &*I, RLEKind::ComputeAvailGenKillSet); |
| } |
| DEBUG(S.dump(*this)); |
| } |
| } |
| |
| void RLEContext::processBasicBlocksWithGenKillSet() { |
| // Process each basic block with the gen and kill set. Every time the |
| // ForwardSetOut of a basic block changes, the optimization is rerun on its |
| // successors. |
| llvm::SmallVector<SILBasicBlock *, 16> WorkList; |
| llvm::DenseSet<SILBasicBlock *> HandledBBs; |
| |
| // Push into the worklist in post order so that we can pop from the back and |
| // get reverse post order. |
| for (SILBasicBlock *BB : PO->getPostOrder()) { |
| WorkList.push_back(BB); |
| HandledBBs.insert(BB); |
| } |
| while (!WorkList.empty()) { |
| SILBasicBlock *BB = WorkList.pop_back_val(); |
| DEBUG(llvm::dbgs() << "PROCESS " << printCtx.getID(BB) |
| << " with Gen/Kill.\n"); |
| HandledBBs.erase(BB); |
| |
| // Intersection. |
| BlockState &Forwarder = getBlockState(BB); |
| // Compute the ForwardSetIn at the beginning of the basic block. |
| Forwarder.mergePredecessorAvailSet(*this); |
| |
| if (Forwarder.processBasicBlockWithGenKillSet()) { |
| for (auto &X : BB->getSuccessors()) { |
| // We do not push a basic block into the worklist if it is already |
| // in the worklist. |
| if (HandledBBs.find(X) != HandledBBs.end()) |
| continue; |
| WorkList.push_back(X); |
| HandledBBs.insert(X); |
| } |
| } |
| DEBUG(Forwarder.dump(*this)); |
| } |
| } |
| |
| void RLEContext::processBasicBlocksForAvailValue() { |
| for (SILBasicBlock *BB : PO->getReversePostOrder()) { |
| DEBUG(llvm::dbgs() << "PROCESS " << printCtx.getID(BB) |
| << " for available.\n"); |
| |
| BlockState &Forwarder = getBlockState(BB); |
| |
| // Merge the predecessors. After merging, BlockState now contains |
| // lists of available LSLocations and their values that reach the |
| // beginning of the basic block along all paths. |
| Forwarder.mergePredecessorAvailSetAndValue(*this); |
| |
| // Merge duplicate loads, and forward stores to loads. We also update the |
| // lists of stores and loads to reflect the end of the basic block. |
| Forwarder.processBasicBlockWithKind(*this, RLEKind::ComputeAvailValue); |
| |
| // Update the locations with available values. We do not need to update |
| // the available BitVector here, as it should have been initialized and |
| // stabilized in processBasicBlocksWithGenKillSet. |
| Forwarder.updateForwardValOut(); |
| |
| DEBUG(Forwarder.dump(*this)); |
| } |
| } |
| |
| void RLEContext::processBasicBlocksForRLE(bool Optimistic) { |
| for (SILBasicBlock *BB : PO->getReversePostOrder()) { |
| DEBUG(llvm::dbgs() << "PROCESS " << printCtx.getID(BB) << " for RLE.\n"); |
| |
| // If this is a multi-iteration function, its forward sets have already |
| // been computed and converged. If, in addition, this basic block has no |
| // LoadInsts, there is no point in processing every instruction in the |
| // basic block again, as no load will be eliminated. |
| if (Optimistic && BBWithLoads.find(BB) == BBWithLoads.end()) |
| continue; |
| |
| BlockState &Forwarder = getBlockState(BB); |
| |
| // Merge the predecessors. After merging, BlockState now contains |
| // lists of available LSLocations and their values that reach the |
| // beginning of the basic block along all paths. |
| Forwarder.mergePredecessorAvailSetAndValue(*this); |
| |
| DEBUG(Forwarder.dump(*this)); |
| |
| // Perform the actual redundant load elimination. |
| Forwarder.processBasicBlockWithKind(*this, RLEKind::PerformRLE); |
| |
| // If this is a multi-iteration data flow, the forward sets have already |
| // been computed. |
| if (Optimistic) |
| continue; |
| |
| // Update the available locations and their values at the end of the block. |
| Forwarder.updateForwardSetOut(); |
| Forwarder.updateForwardValOut(); |
| } |
| } |
| |
| void RLEContext::runIterativeRLE() { |
| // Generate the genset and killset for every basic block. |
| processBasicBlocksForGenKillSet(); |
| |
| // Process basic blocks in RPO, iterating until the data flow converges. |
| processBasicBlocksWithGenKillSet(); |
| |
| // We have computed the available value bit, now go through every basic |
| // block and compute the forwarding value locally. This is necessary |
| // because, when we perform the RLE in the last iteration, we must handle |
| // loops, i.e. predecessor blocks which have not yet been processed when a |
| // basic block is processed. |
| processBasicBlocksForAvailValue(); |
| } |
| |
| bool RLEContext::run() { |
| // We perform redundant load elimination in the following phases. |
| // |
| // Phase 1. Compute the genset and killset for every basic block. |
| // |
| // Phase 2. Use an iterative data flow to compute whether there is an |
| // available value at a given point; we do not yet care about what the value |
| // is. |
| // |
| // Phase 3. Compute the real forwardable value at a given point. |
| // |
| // Phase 4. Perform the redundant load elimination. |
| |
| // Walk over the function and find all the locations accessed by |
| // this function. |
| std::pair<int, int> LSCount = std::make_pair(0, 0); |
| LSLocation::enumerateLSLocations(*Fn, LocationVault, |
| LocToBitIndex, |
| BaseToLocIndex, TE, |
| LSCount); |
| |
| // Check how to optimize this function. |
| ProcessKind Kind = getProcessFunctionKind(LSCount.first, LSCount.second); |
| |
| // We do not optimize this function at all. |
| if (Kind == ProcessKind::ProcessNone) |
| return false; |
| |
| // Do we run a multi-iteration data flow? |
| bool Optimistic = Kind == ProcessKind::ProcessMultipleIterations; |
| |
| // These are the basic blocks that we actually process. We do not process |
| // unreachable blocks; instead their ForwardSetOut is initialized to 0. |
| llvm::DenseSet<SILBasicBlock *> BBToProcess; |
| for (auto X : PO->getPostOrder()) |
| BBToProcess.insert(X); |
| |
| // For all basic blocks in the function, initialize a BB state. Since we |
| // know all the locations accessed in this function, we can resize the bit |
| // vector to the appropriate size. |
| for (auto &B : *Fn) { |
| BBToLocState[&B] = BlockState(); |
| BBToLocState[&B].init(&B, LocationVault.size(), Optimistic && |
| BBToProcess.find(&B) != BBToProcess.end()); |
| } |
| |
| DEBUG(for (unsigned i = 0; i < LocationVault.size(); ++i) { |
| llvm::dbgs() << "LSLocation #" << i; |
| getLocation(i).print(llvm::dbgs(), &Fn->getModule()); |
| }); |
| |
| if (Optimistic) |
| runIterativeRLE(); |
| |
| // We have the available value bit computed and the local forwarding value. |
| // Set up the load forwarding. |
| processBasicBlocksForRLE(Optimistic); |
| |
| // Finally, perform the redundant load replacements. |
| llvm::DenseSet<SILInstruction *> InstsToDelete; |
| bool SILChanged = false; |
| for (auto &B : *Fn) { |
| auto &State = BBToLocState[&B]; |
| auto &Loads = State.getRL(); |
| // Nothing to forward. |
| if (Loads.empty()) |
| continue; |
| // We iterate the instructions in the basic block in a deterministic order |
| // and use this order to perform the load forwarding. |
| // |
| // NOTE: we could end up with different SIL depending on the order in which |
| // load forwarding is performed. |
| for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) { |
| auto Iter = Loads.find(&*I); |
| if (Iter == Loads.end()) |
| continue; |
| DEBUG(llvm::dbgs() << "Replacing " << SILValue(Iter->first) << "With " |
| << Iter->second); |
| SILChanged = true; |
| Iter->first->replaceAllUsesWith(Iter->second); |
| InstsToDelete.insert(Iter->first); |
| ++NumForwardedLoads; |
| } |
| } |
| |
| // Erase the instructions recursively; this way, we avoid a pass dependence |
| // on DCE. |
| for (auto &X : InstsToDelete) { |
| // It is possible that the instruction still has uses, because it could be |
| // used as the replacement value for some other RLE pairs. |
| // |
| // TODO: we should fix this, otherwise we are missing RLE opportunities. |
| if (!X->use_empty()) |
| continue; |
| recursivelyDeleteTriviallyDeadInstructions(X, true); |
| } |
| return SILChanged; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Top Level Entry Point |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
| |
| class RedundantLoadElimination : public SILFunctionTransform { |
| |
| StringRef getName() override { return "SIL Redundant Load Elimination"; } |
| |
| /// The entry point to the transformation. |
| void run() override { |
| SILFunction *F = getFunction(); |
| DEBUG(llvm::dbgs() << "*** RLE on function: " << F->getName() << " ***\n"); |
| |
| auto *AA = PM->getAnalysis<AliasAnalysis>(); |
| auto *TE = PM->getAnalysis<TypeExpansionAnalysis>(); |
| auto *PO = PM->getAnalysis<PostOrderAnalysis>()->get(F); |
| auto *EAFI = PM->getAnalysis<EpilogueARCAnalysis>()->get(F); |
| |
| RLEContext RLE(F, PM, AA, TE, PO, EAFI); |
| if (RLE.run()) { |
| invalidateAnalysis(SILAnalysis::InvalidationKind::Instructions); |
| } |
| } |
| }; |
| |
| } // end anonymous namespace |
| |
| SILTransform *swift::createRedundantLoadElimination() { |
| return new RedundantLoadElimination(); |
| } |