| //===--- PerformanceInliner.cpp - Basic cost based performance inlining ---===// |
| // |
| // This source file is part of the Swift.org open source project |
| // |
| // Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors |
| // Licensed under Apache License v2.0 with Runtime Library Exception |
| // |
| // See http://swift.org/LICENSE.txt for license information |
| // See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #define DEBUG_TYPE "sil-inliner" |
| #include "swift/SIL/SILInstruction.h" |
| #include "swift/SIL/Dominance.h" |
| #include "swift/SIL/SILModule.h" |
| #include "swift/SIL/Projection.h" |
| #include "swift/SIL/InstructionUtils.h" |
| #include "swift/SILOptimizer/Analysis/ColdBlockInfo.h" |
| #include "swift/SILOptimizer/Analysis/DominanceAnalysis.h" |
| #include "swift/SILOptimizer/Analysis/FunctionOrder.h" |
| #include "swift/SILOptimizer/Analysis/LoopAnalysis.h" |
| #include "swift/SILOptimizer/Analysis/ArraySemantic.h" |
| #include "swift/SILOptimizer/PassManager/Passes.h" |
| #include "swift/SILOptimizer/PassManager/Transforms.h" |
| #include "swift/SILOptimizer/Utils/Local.h" |
| #include "swift/SILOptimizer/Utils/ConstantFolding.h" |
| #include "swift/SILOptimizer/Utils/SILInliner.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/ADT/MapVector.h" |
| #include <functional> |
| |
| |
| using namespace swift; |
| |
| STATISTIC(NumFunctionsInlined, "Number of functions inlined"); |
| |
| llvm::cl::opt<bool> PrintShortestPathInfo( |
| "print-shortest-path-info", llvm::cl::init(false), |
| llvm::cl::desc("Print shortest-path information for inlining")); |
| |
| //===----------------------------------------------------------------------===// |
| // ConstantTracker |
| //===----------------------------------------------------------------------===// |
| |
| // Tracks constants in the caller and callee to get an estimation of what |
| // values get constant if the callee is inlined. |
| // This can be seen as a "simulation" of several optimizations: SROA, mem2reg |
| // and constant propagation. |
| // Note that this is only a simplified model and not correct in all cases. |
| // For example aliasing information is not taken into account. |
| class ConstantTracker { |
| |
| // Represents a value in integer constant evaluation. |
| struct IntConst { |
| IntConst() : isValid(false), isFromCaller(false) { } |
| |
| IntConst(const APInt &value, bool isFromCaller) : |
| value(value), isValid(true), isFromCaller(isFromCaller) { } |
| |
| // The actual value. |
| APInt value; |
| |
| // True if the value is valid, i.e. could be evaluated to a constant. |
| bool isValid; |
| |
| // True if the value is only valid, because a constant is passed to the |
| // callee. False if constant propagation could do the same job inside the |
| // callee without inlining it. |
| bool isFromCaller; |
| }; |
| |
| // Links between loaded and stored values. |
| // The key is a load instruction, the value is the corresponding store |
| // instruction which stores the loaded value. Both, key and value can also |
| // be copy_addr instructions. |
| llvm::DenseMap<SILInstruction *, SILInstruction *> links; |
| |
| // The current stored values at memory addresses. |
| // The key is the base address of the memory (after skipping address |
| // projections). The value are store (or copy_addr) instructions, which |
| // store the current value. |
| // This is only an estimation, because e.g. it does not consider potential |
| // aliasing. |
| llvm::DenseMap<SILValue, SILInstruction *> memoryContent; |
| |
| // Cache for evaluated constants. |
| llvm::SmallDenseMap<BuiltinInst *, IntConst> constCache; |
| |
| // The caller/callee function which is tracked. |
| SILFunction *F; |
| |
| // The constant tracker of the caller function (null if this is the |
| // tracker of the callee). |
| ConstantTracker *callerTracker; |
| |
| // The apply instruction in the caller (null if this is the tracker of the |
| // callee). |
| FullApplySite AI; |
| |
| // Walks through address projections and (optionally) collects them. |
| // Returns the base address, i.e. the first address which is not a |
| // projection. |
| SILValue scanProjections(SILValue addr, |
| SmallVectorImpl<Projection> *Result = nullptr); |
| |
| // Get the stored value for a load. The loadInst can be either a real load |
| // or a copy_addr. |
| SILValue getStoredValue(SILInstruction *loadInst, |
| ProjectionPath &projStack); |
| |
| // Gets the parameter in the caller for a function argument. |
| SILValue getParam(SILValue value) { |
| if (SILArgument *arg = dyn_cast<SILArgument>(value)) { |
| if (AI && arg->isFunctionArg() && arg->getFunction() == F) { |
| // Continue at the caller. |
| return AI.getArgument(arg->getIndex()); |
| } |
| } |
| return SILValue(); |
| } |
| |
| SILInstruction *getMemoryContent(SILValue addr) { |
| // The memory content can be stored in this ConstantTracker or in the |
| // caller's ConstantTracker. |
| SILInstruction *storeInst = memoryContent[addr]; |
| if (storeInst) |
| return storeInst; |
| if (callerTracker) |
| return callerTracker->getMemoryContent(addr); |
| return nullptr; |
| } |
| |
| // Gets the estimated definition of a value. |
| SILInstruction *getDef(SILValue val, ProjectionPath &projStack); |
| |
| // Gets the estimated integer constant result of a builtin. |
| IntConst getBuiltinConst(BuiltinInst *BI, int depth); |
| |
| public: |
| |
| // Constructor for the caller function. |
| ConstantTracker(SILFunction *function) : |
| F(function), callerTracker(nullptr), AI() |
| { } |
| |
| // Constructor for the callee function. |
| ConstantTracker(SILFunction *function, ConstantTracker *caller, |
| FullApplySite callerApply) : |
| F(function), callerTracker(caller), AI(callerApply) |
| { } |
| |
| void beginBlock() { |
| // Currently we don't do any sophisticated dataflow analysis, so we keep |
| // the memoryContent alive only for a single block. |
| memoryContent.clear(); |
| } |
| |
| // Must be called for each instruction visited in dominance order. |
| void trackInst(SILInstruction *inst); |
| |
| // Gets the estimated definition of a value. |
| SILInstruction *getDef(SILValue val) { |
| ProjectionPath projStack(val->getType()); |
| return getDef(val, projStack); |
| } |
| |
| // Gets the estimated definition of a value if it is in the caller. |
| SILInstruction *getDefInCaller(SILValue val) { |
| SILInstruction *def = getDef(val); |
| if (def && def->getFunction() != F) |
| return def; |
| return nullptr; |
| } |
| |
| bool isStackAddrInCaller(SILValue val) { |
| if (SILValue Param = getParam(stripAddressProjections(val))) { |
| return isa<AllocStackInst>(stripAddressProjections(Param)); |
| } |
| return false; |
| } |
| |
| // Gets the estimated integer constant of a value. |
| IntConst getIntConst(SILValue val, int depth = 0); |
| |
| SILBasicBlock *getTakenBlock(TermInst *term); |
| }; |
| |
| void ConstantTracker::trackInst(SILInstruction *inst) { |
| if (auto *LI = dyn_cast<LoadInst>(inst)) { |
| SILValue baseAddr = scanProjections(LI->getOperand()); |
| if (SILInstruction *loadLink = getMemoryContent(baseAddr)) |
| links[LI] = loadLink; |
| } else if (StoreInst *SI = dyn_cast<StoreInst>(inst)) { |
| SILValue baseAddr = scanProjections(SI->getOperand(1)); |
| memoryContent[baseAddr] = SI; |
| } else if (CopyAddrInst *CAI = dyn_cast<CopyAddrInst>(inst)) { |
| if (!CAI->isTakeOfSrc()) { |
| // Treat a copy_addr as a load + store |
| SILValue loadAddr = scanProjections(CAI->getOperand(0)); |
| if (SILInstruction *loadLink = getMemoryContent(loadAddr)) { |
| links[CAI] = loadLink; |
| SILValue storeAddr = scanProjections(CAI->getOperand(1)); |
| memoryContent[storeAddr] = CAI; |
| } |
| } |
| } |
| } |
| |
| SILValue ConstantTracker::scanProjections(SILValue addr, |
| SmallVectorImpl<Projection> *Result) { |
| for (;;) { |
| if (Projection::isAddressProjection(addr)) { |
| SILInstruction *I = cast<SILInstruction>(addr); |
| if (Result) { |
| Result->push_back(Projection(I)); |
| } |
| addr = I->getOperand(0); |
| continue; |
| } |
| if (SILValue param = getParam(addr)) { |
| // Go to the caller. |
| addr = param; |
| continue; |
| } |
| // Return the base address = the first address which is not a projection. |
| return addr; |
| } |
| } |
| |
| SILValue ConstantTracker::getStoredValue(SILInstruction *loadInst, |
| ProjectionPath &projStack) { |
| SILInstruction *store = links[loadInst]; |
| if (!store && callerTracker) |
| store = callerTracker->links[loadInst]; |
| if (!store) return SILValue(); |
| |
| assert(isa<LoadInst>(loadInst) || isa<CopyAddrInst>(loadInst)); |
| |
| // Push the address projections of the load onto the stack. |
| SmallVector<Projection, 4> loadProjections; |
| scanProjections(loadInst->getOperand(0), &loadProjections); |
| for (const Projection &proj : loadProjections) { |
| projStack.push_back(proj); |
| } |
| |
| // Pop the address projections of the store from the stack. |
| SmallVector<Projection, 4> storeProjections; |
| scanProjections(store->getOperand(1), &storeProjections); |
| for (auto iter = storeProjections.rbegin(); iter != storeProjections.rend(); |
| ++iter) { |
| const Projection &proj = *iter; |
| // The corresponding load-projection must match the store-projection. |
| if (projStack.empty() || projStack.back() != proj) |
| return SILValue(); |
| projStack.pop_back(); |
| } |
| |
| if (isa<StoreInst>(store)) |
| return store->getOperand(0); |
| |
| // The copy_addr instruction is both a load and a store. So we follow the link |
| // again. |
| assert(isa<CopyAddrInst>(store)); |
| return getStoredValue(store, projStack); |
| } |
| |
| // Get the aggregate member based on the top of the projection stack. |
| static SILValue getMember(SILInstruction *inst, ProjectionPath &projStack) { |
| if (!projStack.empty()) { |
| const Projection &proj = projStack.back(); |
| return proj.getOperandForAggregate(inst); |
| } |
| return SILValue(); |
| } |
| |
| SILInstruction *ConstantTracker::getDef(SILValue val, |
| ProjectionPath &projStack) { |
| |
| // Track the value up the dominator tree. |
| for (;;) { |
| if (SILInstruction *inst = dyn_cast<SILInstruction>(val)) { |
| if (Projection::isObjectProjection(inst)) { |
| // Extract a member from a struct/tuple/enum. |
| projStack.push_back(Projection(inst)); |
| val = inst->getOperand(0); |
| continue; |
| } else if (SILValue member = getMember(inst, projStack)) { |
| // The opposite of a projection instruction: composing a struct/tuple. |
| projStack.pop_back(); |
| val = member; |
| continue; |
| } else if (SILValue loadedVal = getStoredValue(inst, projStack)) { |
| // A value loaded from memory. |
| val = loadedVal; |
| continue; |
| } else if (isa<ThinToThickFunctionInst>(inst)) { |
| val = inst->getOperand(0); |
| continue; |
| } |
| return inst; |
| } else if (SILValue param = getParam(val)) { |
| // Continue in the caller. |
| val = param; |
| continue; |
| } |
| return nullptr; |
| } |
| } |
| |
| ConstantTracker::IntConst ConstantTracker::getBuiltinConst(BuiltinInst *BI, int depth) { |
| const BuiltinInfo &Builtin = BI->getBuiltinInfo(); |
| OperandValueArrayRef Args = BI->getArguments(); |
| switch (Builtin.ID) { |
| default: break; |
| |
| // Fold comparison predicates. |
| #define BUILTIN(id, name, Attrs) |
| #define BUILTIN_BINARY_PREDICATE(id, name, attrs, overload) \ |
| case BuiltinValueKind::id: |
| #include "swift/AST/Builtins.def" |
| { |
| IntConst lhs = getIntConst(Args[0], depth); |
| IntConst rhs = getIntConst(Args[1], depth); |
| if (lhs.isValid && rhs.isValid) { |
| return IntConst(constantFoldComparison(lhs.value, rhs.value, |
| Builtin.ID), |
| lhs.isFromCaller || rhs.isFromCaller); |
| } |
| break; |
| } |
| |
| case BuiltinValueKind::SAddOver: |
| case BuiltinValueKind::UAddOver: |
| case BuiltinValueKind::SSubOver: |
| case BuiltinValueKind::USubOver: |
| case BuiltinValueKind::SMulOver: |
| case BuiltinValueKind::UMulOver: { |
| IntConst lhs = getIntConst(Args[0], depth); |
| IntConst rhs = getIntConst(Args[1], depth); |
| if (lhs.isValid && rhs.isValid) { |
| bool IgnoredOverflow; |
| return IntConst(constantFoldBinaryWithOverflow(lhs.value, rhs.value, |
| IgnoredOverflow, |
| getLLVMIntrinsicIDForBuiltinWithOverflow(Builtin.ID)), |
| lhs.isFromCaller || rhs.isFromCaller); |
| } |
| break; |
| } |
| |
| case BuiltinValueKind::SDiv: |
| case BuiltinValueKind::SRem: |
| case BuiltinValueKind::UDiv: |
| case BuiltinValueKind::URem: { |
| IntConst lhs = getIntConst(Args[0], depth); |
| IntConst rhs = getIntConst(Args[1], depth); |
| if (lhs.isValid && rhs.isValid && rhs.value != 0) { |
| bool IgnoredOverflow; |
| return IntConst(constantFoldDiv(lhs.value, rhs.value, |
| IgnoredOverflow, Builtin.ID), |
| lhs.isFromCaller || rhs.isFromCaller); |
| } |
| break; |
| } |
| |
| case BuiltinValueKind::And: |
| case BuiltinValueKind::AShr: |
| case BuiltinValueKind::LShr: |
| case BuiltinValueKind::Or: |
| case BuiltinValueKind::Shl: |
| case BuiltinValueKind::Xor: { |
| IntConst lhs = getIntConst(Args[0], depth); |
| IntConst rhs = getIntConst(Args[1], depth); |
| if (lhs.isValid && rhs.isValid) { |
| return IntConst(constantFoldBitOperation(lhs.value, rhs.value, |
| Builtin.ID), |
| lhs.isFromCaller || rhs.isFromCaller); |
| } |
| break; |
| } |
| |
| case BuiltinValueKind::Trunc: |
| case BuiltinValueKind::ZExt: |
| case BuiltinValueKind::SExt: |
| case BuiltinValueKind::TruncOrBitCast: |
| case BuiltinValueKind::ZExtOrBitCast: |
| case BuiltinValueKind::SExtOrBitCast: { |
| IntConst val = getIntConst(Args[0], depth); |
| if (val.isValid) { |
| return IntConst(constantFoldCast(val.value, Builtin), val.isFromCaller); |
| } |
| break; |
| } |
| } |
| return IntConst(); |
| } |
| |
| // Tries to evaluate the integer constant of a value. The \p depth is used |
| // to limit the complexity. |
| ConstantTracker::IntConst ConstantTracker::getIntConst(SILValue val, int depth) { |
| |
| // Don't spend too much time with constant evaluation. |
| if (depth >= 10) |
| return IntConst(); |
| |
| SILInstruction *I = getDef(val); |
| if (!I) |
| return IntConst(); |
| |
| if (auto *IL = dyn_cast<IntegerLiteralInst>(I)) { |
| return IntConst(IL->getValue(), IL->getFunction() != F); |
| } |
| if (auto *BI = dyn_cast<BuiltinInst>(I)) { |
| if (constCache.count(BI) != 0) |
| return constCache[BI]; |
| |
| IntConst builtinConst = getBuiltinConst(BI, depth + 1); |
| constCache[BI] = builtinConst; |
| return builtinConst; |
| } |
| return IntConst(); |
| } |
| |
| // Returns the taken block of a terminator instruction if the condition turns |
| // out to be constant. |
| SILBasicBlock *ConstantTracker::getTakenBlock(TermInst *term) { |
| if (CondBranchInst *CBI = dyn_cast<CondBranchInst>(term)) { |
| IntConst condConst = getIntConst(CBI->getCondition()); |
| if (condConst.isFromCaller) { |
| return condConst.value != 0 ? CBI->getTrueBB() : CBI->getFalseBB(); |
| } |
| return nullptr; |
| } |
| if (SwitchValueInst *SVI = dyn_cast<SwitchValueInst>(term)) { |
| IntConst switchConst = getIntConst(SVI->getOperand()); |
| if (switchConst.isFromCaller) { |
| for (unsigned Idx = 0; Idx < SVI->getNumCases(); ++Idx) { |
| auto switchCase = SVI->getCase(Idx); |
| if (auto *IL = dyn_cast<IntegerLiteralInst>(switchCase.first)) { |
| if (switchConst.value == IL->getValue()) |
| return switchCase.second; |
| } else { |
| return nullptr; |
| } |
| } |
| if (SVI->hasDefault()) |
| return SVI->getDefaultBB(); |
| } |
| return nullptr; |
| } |
| if (SwitchEnumInst *SEI = dyn_cast<SwitchEnumInst>(term)) { |
| if (SILInstruction *def = getDefInCaller(SEI->getOperand())) { |
| if (EnumInst *EI = dyn_cast<EnumInst>(def)) { |
| for (unsigned Idx = 0; Idx < SEI->getNumCases(); ++Idx) { |
| auto enumCase = SEI->getCase(Idx); |
| if (enumCase.first == EI->getElement()) |
| return enumCase.second; |
| } |
| if (SEI->hasDefault()) |
| return SEI->getDefaultBB(); |
| } |
| } |
| return nullptr; |
| } |
| if (CheckedCastBranchInst *CCB = dyn_cast<CheckedCastBranchInst>(term)) { |
| if (SILInstruction *def = getDefInCaller(CCB->getOperand())) { |
| if (UpcastInst *UCI = dyn_cast<UpcastInst>(def)) { |
| SILType castType = UCI->getOperand()->getType(); |
| if (CCB->getCastType().isExactSuperclassOf(castType)) { |
| return CCB->getSuccessBB(); |
| } |
| if (!castType.isBindableToSuperclassOf(CCB->getCastType())) { |
| return CCB->getFailureBB(); |
| } |
| } |
| } |
| } |
| return nullptr; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Shortest path analysis |
| //===----------------------------------------------------------------------===// |
| |
| /// Computes the length of the shortest path through a block in a scope. |
| /// |
| /// A scope is either a loop or the whole function. The "length" is an |
| /// estimation of the execution time. For example if the shortest path of a |
| /// block B is N, it means that the execution time of the scope (either |
| /// function or a single loop iteration), when execution includes the block, is |
| /// at least N, regardless of what branches are taken. |
| /// |
| /// The shortest path of a block is the sum of the shortest path from the scope |
| /// entry and the shortest path to the scope exit. |
| class ShortestPathAnalysis { |
| public: |
| enum { |
| /// We assume that cold block take very long to execute. |
| ColdBlockLength = 1000, |
| |
| /// Our assumption on how many times a loop is executed. |
| LoopCount = 10, |
| |
| /// To keep things simple we only analyse up to this number of nested loops. |
| MaxNumLoopLevels = 4, |
| |
| /// The "weight" for the benefit which a single loop nest gives. |
| SingleLoopWeight = 4, |
| |
| /// Pretty large but small enough to add something without overflowing. |
| InitialDist = (1 << 29) |
| }; |
| |
| /// A weight for an inlining benefit. |
| class Weight { |
| /// How long does it take to execute the scope (either loop or the whole |
| /// function) of the call to inline? The larger it is the less important it |
| /// is to inline. |
| int ScopeLength; |
| |
| /// Represents the loop nest. The larger it is the more important it is to |
| /// inline |
| int LoopWeight; |
| |
| friend class ShortestPathAnalysis; |
| |
| public: |
| Weight(int ScopeLength, int LoopWeight) : ScopeLength(ScopeLength), |
| LoopWeight(LoopWeight) { } |
| |
| Weight(const Weight &RHS, int AdditionalLoopWeight) : |
| Weight(RHS.ScopeLength, RHS.LoopWeight + AdditionalLoopWeight) { } |
| |
| Weight() : ScopeLength(-1), LoopWeight(0) { |
| assert(!isValid()); |
| } |
| |
| bool isValid() const { return ScopeLength >= 0; } |
| |
| /// Updates the \p Benefit by weighting the \p Importance. |
| void updateBenefit(int &Benefit, int Importance) const { |
| assert(isValid()); |
| int newBenefit = 0; |
| |
| // Use some heuristics. The basic idea is: length is bad, loops are good. |
| if (ScopeLength > 320) { |
| newBenefit = Importance; |
| } else if (ScopeLength > 160) { |
| newBenefit = Importance + LoopWeight * 4; |
| } else if (ScopeLength > 80) { |
| newBenefit = Importance + LoopWeight * 8; |
| } else if (ScopeLength > 40) { |
| newBenefit = Importance + LoopWeight * 12; |
| } else if (ScopeLength > 20) { |
| newBenefit = Importance + LoopWeight * 16; |
| } else { |
| newBenefit = Importance + 20 + LoopWeight * 16; |
| } |
| // We don't accumulate the benefit instead we max it. |
| if (newBenefit > Benefit) |
| Benefit = newBenefit; |
| } |
| |
| #ifndef NDEBUG |
| friend raw_ostream &operator<<(raw_ostream &os, const Weight &W) { |
| os << W.LoopWeight << '/' << W.ScopeLength; |
| return os; |
| } |
| #endif |
| }; |
| |
| private: |
| /// The distances for a block in it's scope (either loop or function). |
| struct Distances { |
| /// The shortest distance from the scope entry to the block entry. |
| int DistFromEntry = InitialDist; |
| |
| /// The shortest distance from the block entry to the scope exit. |
| int DistToExit = InitialDist; |
| |
| /// Additional length to account for loop iterations. It's != 0 for loop |
| /// headers (or loop predecessors). |
| int LoopHeaderLength = 0; |
| }; |
| |
| /// Holds the distance information for a block in all it's scopes, i.e. in all |
| /// its containing loops and the function itself. |
| struct BlockInfo { |
| private: |
| /// Distances for all scopes. Element 0 refers to the function, elements |
| /// > 0 to loops. |
| Distances Dists[MaxNumLoopLevels]; |
| public: |
| /// The length of the block itself. |
| int Length = 0; |
| |
| /// Returns the distances for the loop with nesting level \p LoopDepth. |
| Distances &getDistances(int LoopDepth) { |
| assert(LoopDepth >= 0 && LoopDepth < MaxNumLoopLevels); |
| return Dists[LoopDepth]; |
| } |
| |
| /// Returns the length including the LoopHeaderLength for a given |
| /// \p LoopDepth. |
| int getLength(int LoopDepth) { |
| return Length + getDistances(LoopDepth).LoopHeaderLength; |
| } |
| |
| /// Returns the length of the shortest path of this block in the loop with |
| /// nesting level \p LoopDepth. |
| int getScopeLength(int LoopDepth) { |
| const Distances &D = getDistances(LoopDepth); |
| return D.DistFromEntry + D.DistToExit; |
| } |
| }; |
| |
| SILFunction *F; |
| SILLoopInfo *LI; |
| llvm::DenseMap<const SILBasicBlock *, BlockInfo *> BlockInfos; |
| std::vector<BlockInfo> BlockInfoStorage; |
| |
| BlockInfo *getBlockInfo(const SILBasicBlock *BB) { |
| BlockInfo *BI = BlockInfos[BB]; |
| assert(BI); |
| return BI; |
| } |
| |
| // Utility functions to make the template solveDataFlow compilable for a block |
| // list containing references _and_ a list containing pointers. |
| |
| const SILBasicBlock *getBlock(const SILBasicBlock *BB) { return BB; } |
| const SILBasicBlock *getBlock(const SILBasicBlock &BB) { return &BB; } |
| |
| /// Returns the minimum distance from all predecessor blocks of \p BB. |
| int getEntryDistFromPreds(const SILBasicBlock *BB, int LoopDepth); |
| |
| /// Returns the minimum distance from all successor blocks of \p BB. |
| int getExitDistFromSuccs(const SILBasicBlock *BB, int LoopDepth); |
| |
| /// Computes the distances by solving the dataflow problem for all \p Blocks |
| /// in a scope. |
| template <typename BlockList> |
| void solveDataFlow(const BlockList &Blocks, int LoopDepth) { |
| bool Changed = false; |
| |
| // Compute the distances from the entry block. |
| do { |
| Changed = false; |
| for (auto &Block : Blocks) { |
| const SILBasicBlock *BB = getBlock(Block); |
| BlockInfo *BBInfo = getBlockInfo(BB); |
| int DistFromEntry = getEntryDistFromPreds(BB, LoopDepth); |
| Distances &BBDists = BBInfo->getDistances(LoopDepth); |
| if (DistFromEntry < BBDists.DistFromEntry) { |
| BBDists.DistFromEntry = DistFromEntry; |
| Changed = true; |
| } |
| } |
| } while (Changed); |
| |
| // Compute the distances to the exit block. |
| do { |
| Changed = false; |
| for (auto &Block : reverse(Blocks)) { |
| const SILBasicBlock *BB = getBlock(Block); |
| BlockInfo *BBInfo = getBlockInfo(BB); |
| int DistToExit = |
| getExitDistFromSuccs(BB, LoopDepth) + BBInfo->getLength(LoopDepth); |
| Distances &BBDists = BBInfo->getDistances(LoopDepth); |
| if (DistToExit < BBDists.DistToExit) { |
| BBDists.DistToExit = DistToExit; |
| Changed = true; |
| } |
| } |
| } while (Changed); |
| } |
| |
| /// Analyze \p Loop and all its inner loops. |
| void analyzeLoopsRecursively(SILLoop *Loop, int LoopDepth); |
| |
| void printFunction(llvm::raw_ostream &OS); |
| |
| void printLoop(llvm::raw_ostream &OS, SILLoop *Loop, int LoopDepth); |
| |
| void printBlockInfo(llvm::raw_ostream &OS, SILBasicBlock *BB, int LoopDepth); |
| |
| public: |
| ShortestPathAnalysis(SILFunction *F, SILLoopInfo *LI) : F(F), LI(LI) { } |
| |
| bool isValid() const { return !BlockInfos.empty(); } |
| |
| /// Compute the distances. The function \p getApplyLength returns the length |
| /// of a function call. |
| template <typename Func> |
| void analyze(ColdBlockInfo &CBI, Func getApplyLength) { |
| assert(!isValid()); |
| |
| BlockInfoStorage.resize(F->size()); |
| |
| // First step: compute the length of the blocks. |
| unsigned BlockIdx = 0; |
| for (SILBasicBlock &BB : *F) { |
| int Length = 0; |
| if (CBI.isCold(&BB)) { |
| Length = ColdBlockLength; |
| } else { |
| for (SILInstruction &I : BB) { |
| if (auto FAS = FullApplySite::isa(&I)) { |
| Length += getApplyLength(FAS); |
| } else { |
| Length += (int)instructionInlineCost(I); |
| } |
| } |
| } |
| BlockInfo *BBInfo = &BlockInfoStorage[BlockIdx++]; |
| BlockInfos[&BB] = BBInfo; |
| BBInfo->Length = Length; |
| |
| // Initialize the distances for the entry and exit blocks, used when |
| // computing the distances for the function itself. |
| if (&BB == &F->front()) |
| BBInfo->getDistances(0).DistFromEntry = 0; |
| |
| if (isa<ReturnInst>(BB.getTerminator())) |
| BBInfo->getDistances(0).DistToExit = Length; |
| else if (isa<ThrowInst>(BB.getTerminator())) |
| BBInfo->getDistances(0).DistToExit = Length + ColdBlockLength; |
| } |
| // Compute the distances for all loops in the function. |
| for (SILLoop *Loop : *LI) { |
| analyzeLoopsRecursively(Loop, 1); |
| } |
| // Compute the distances for the function itself. |
| solveDataFlow(F->getBlocks(), 0); |
| } |
| |
| /// Returns the length of the shortest path of the block \p BB in the loop |
| /// with nesting level \p LoopDepth. If \p LoopDepth is 0 ir returns the |
| /// shortest path in the function. |
| int getScopeLength(SILBasicBlock *BB, int LoopDepth) { |
| assert(BB->getParent() == F); |
| if (LoopDepth >= MaxNumLoopLevels) |
| LoopDepth = MaxNumLoopLevels - 1; |
| return getBlockInfo(BB)->getScopeLength(LoopDepth); |
| } |
| |
| /// Returns the weight of block \p BB also considering the \p CallerWeight |
| /// which is the weight of the call site's block in the caller. |
| Weight getWeight(SILBasicBlock *BB, Weight CallerWeight); |
| |
| void dump(); |
| }; |
| |
| int ShortestPathAnalysis::getEntryDistFromPreds(const SILBasicBlock *BB, |
| int LoopDepth) { |
| int MinDist = InitialDist; |
| for (SILBasicBlock *Pred : BB->getPreds()) { |
| BlockInfo *PredInfo = getBlockInfo(Pred); |
| Distances &PDists = PredInfo->getDistances(LoopDepth); |
| int DistFromEntry = PDists.DistFromEntry + PredInfo->Length + |
| PDists.LoopHeaderLength; |
| assert(DistFromEntry >= 0); |
| if (DistFromEntry < MinDist) |
| MinDist = DistFromEntry; |
| } |
| return MinDist; |
| } |
| |
| int ShortestPathAnalysis::getExitDistFromSuccs(const SILBasicBlock *BB, |
| int LoopDepth) { |
| int MinDist = InitialDist; |
| for (const SILSuccessor &Succ : BB->getSuccessors()) { |
| BlockInfo *SuccInfo = getBlockInfo(Succ); |
| Distances &SDists = SuccInfo->getDistances(LoopDepth); |
| if (SDists.DistToExit < MinDist) |
| MinDist = SDists.DistToExit; |
| } |
| return MinDist; |
| } |
| |
| /// Detect an edge from the loop pre-header's predecessor to the loop exit |
| /// block. Such an edge "short-cuts" a loop if it is never iterated. But usually |
| /// it is the less frequent case and we want to ignore it. |
| /// E.g. it handles the case of N==0 for |
| /// for i in 0..<N { ... } |
| /// If the \p Loop has such an edge the source block of this edge is returned, |
| /// which is the predecessor of the loop pre-header. |
| static SILBasicBlock *detectLoopBypassPreheader(SILLoop *Loop) { |
| SILBasicBlock *Pred = Loop->getLoopPreheader(); |
| if (!Pred) |
| return nullptr; |
| |
| SILBasicBlock *PredPred = Pred->getSinglePredecessor(); |
| if (!PredPred) |
| return nullptr; |
| |
| auto *CBR = dyn_cast<CondBranchInst>(PredPred->getTerminator()); |
| if (!CBR) |
| return nullptr; |
| |
| SILBasicBlock *Succ = (CBR->getTrueBB() == Pred ? CBR->getFalseBB() : |
| CBR->getTrueBB()); |
| |
| for (SILBasicBlock *PredOfSucc : Succ->getPreds()) { |
| SILBasicBlock *Exiting = PredOfSucc->getSinglePredecessor(); |
| if (!Exiting) |
| Exiting = PredOfSucc; |
| if (Loop->contains(Exiting)) |
| return PredPred; |
| } |
| return nullptr; |
| } |
| |
| void ShortestPathAnalysis::analyzeLoopsRecursively(SILLoop *Loop, int LoopDepth) { |
| if (LoopDepth >= MaxNumLoopLevels) |
| return; |
| |
| // First dive into the inner loops. |
| for (SILLoop *SubLoop : Loop->getSubLoops()) { |
| analyzeLoopsRecursively(SubLoop, LoopDepth + 1); |
| } |
| |
| BlockInfo *HeaderInfo = getBlockInfo(Loop->getHeader()); |
| Distances &HeaderDists = HeaderInfo->getDistances(LoopDepth); |
| |
| // Initial values for the entry (== header) and exit-predecessor (== header as |
| // well). |
| HeaderDists.DistFromEntry = 0; |
| HeaderDists.DistToExit = 0; |
| |
| solveDataFlow(Loop->getBlocks(), LoopDepth); |
| |
| int LoopLength = getExitDistFromSuccs(Loop->getHeader(), LoopDepth) + |
| HeaderInfo->getLength(LoopDepth); |
| HeaderDists.DistToExit = LoopLength; |
| |
| // If there is a loop bypass edge, add the loop length to the loop pre-pre- |
| // header instead to the header. This actually let us ignore the loop bypass |
| // edge in the length calculation for the loop's parent scope. |
| if (SILBasicBlock *Bypass = detectLoopBypassPreheader(Loop)) |
| HeaderInfo = getBlockInfo(Bypass); |
| |
| // Add the full loop length (= assumed-iteration-count * length) to the loop |
| // header so that it is considered in the parent scope. |
| HeaderInfo->getDistances(LoopDepth - 1).LoopHeaderLength = |
| LoopCount * LoopLength; |
| } |
| |
| ShortestPathAnalysis::Weight ShortestPathAnalysis:: |
| getWeight(SILBasicBlock *BB, Weight CallerWeight) { |
| assert(BB->getParent() == F); |
| |
| SILLoop *Loop = LI->getLoopFor(BB); |
| if (!Loop) { |
| // We are not in a loop. So just account the length of our function scope |
| // in to the length of the CallerWeight. |
| return Weight(CallerWeight.ScopeLength + getScopeLength(BB, 0), |
| CallerWeight.LoopWeight); |
| } |
| int LoopDepth = Loop->getLoopDepth(); |
| // Deal with the corner case of having more than 4 nested loops. |
| while (LoopDepth >= MaxNumLoopLevels) { |
| --LoopDepth; |
| Loop = Loop->getParentLoop(); |
| } |
| Weight W(getScopeLength(BB, LoopDepth), SingleLoopWeight); |
| |
| // Add weights for all the loops BB is in. |
| while (Loop) { |
| assert(LoopDepth > 0); |
| BlockInfo *HeaderInfo = getBlockInfo(Loop->getHeader()); |
| int InnerLoopLength = HeaderInfo->getScopeLength(LoopDepth) * |
| ShortestPathAnalysis::LoopCount; |
| int OuterLoopWeight = SingleLoopWeight; |
| int OuterScopeLength = HeaderInfo->getScopeLength(LoopDepth - 1); |
| |
| // Reaching the outermost loop, we use the CallerWeight to get the outer |
| // length+loopweight. |
| if (LoopDepth == 1) { |
| // If the apply in the caller is not in a significant loop, just stop with |
| // what we have now. |
| if (CallerWeight.LoopWeight < 4) |
| return W; |
| |
| // If this function is part of the caller's scope length take the caller's |
| // scope length. Note: this is not the case e.g. if the apply is in a |
| // then-branch of an if-then-else in the caller and the else-branch is |
| // the short path. |
| if (CallerWeight.ScopeLength > OuterScopeLength) |
| OuterScopeLength = CallerWeight.ScopeLength; |
| OuterLoopWeight = CallerWeight.LoopWeight; |
| } |
| assert(OuterScopeLength >= InnerLoopLength); |
| |
| // If the current loop is only a small part of its outer loop, we don't |
| // take the outer loop that much into account. Only if the current loop is |
| // actually the "main part" in the outer loop we add the full loop weight |
| // for the outer loop. |
| if (OuterScopeLength < InnerLoopLength * 2) { |
| W.LoopWeight += OuterLoopWeight - 1; |
| } else if (OuterScopeLength < InnerLoopLength * 3) { |
| W.LoopWeight += OuterLoopWeight - 2; |
| } else if (OuterScopeLength < InnerLoopLength * 4) { |
| W.LoopWeight += OuterLoopWeight - 3; |
| } else { |
| return W; |
| } |
| --LoopDepth; |
| Loop = Loop->getParentLoop(); |
| } |
| assert(LoopDepth == 0); |
| return W; |
| } |
| |
| void ShortestPathAnalysis::dump() { |
| printFunction(llvm::errs()); |
| } |
| |
| void ShortestPathAnalysis::printFunction(llvm::raw_ostream &OS) { |
| OS << "SPA @" << F->getName() << "\n"; |
| for (SILBasicBlock &BB : *F) { |
| printBlockInfo(OS, &BB, 0); |
| } |
| for (SILLoop *Loop : *LI) { |
| printLoop(OS, Loop, 1); |
| } |
| } |
| |
| void ShortestPathAnalysis::printLoop(llvm::raw_ostream &OS, SILLoop *Loop, |
| int LoopDepth) { |
| if (LoopDepth >= MaxNumLoopLevels) |
| return; |
| assert(LoopDepth == (int)Loop->getLoopDepth()); |
| |
| OS << "Loop bb" << Loop->getHeader()->getDebugID() << ":\n"; |
| for (SILBasicBlock *BB : Loop->getBlocks()) { |
| printBlockInfo(OS, BB, LoopDepth); |
| } |
| for (SILLoop *SubLoop : Loop->getSubLoops()) { |
| printLoop(OS, SubLoop, LoopDepth + 1); |
| } |
| } |
| |
| void ShortestPathAnalysis::printBlockInfo(llvm::raw_ostream &OS, |
| SILBasicBlock *BB, int LoopDepth) { |
| BlockInfo *BBInfo = getBlockInfo(BB); |
| Distances &D = BBInfo->getDistances(LoopDepth); |
| OS << " bb" << BB->getDebugID() << ": length=" << BBInfo->Length << '+' |
| << D.LoopHeaderLength << ", d-entry=" << D.DistFromEntry |
| << ", d-exit=" << D.DistToExit << '\n'; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Performance Inliner |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
| |
| // Controls the decision to inline functions with @_semantics, @effect and |
| // global_init attributes. |
| enum class InlineSelection { |
| Everything, |
| NoGlobalInit, // and no availability semantics calls |
| NoSemanticsAndGlobalInit |
| }; |
| |
| using Weight = ShortestPathAnalysis::Weight; |
| |
| class SILPerformanceInliner { |
| /// Specifies which functions not to inline, based on @_semantics and |
| /// global_init attributes. |
| InlineSelection WhatToInline; |
| |
| DominanceAnalysis *DA; |
| SILLoopAnalysis *LA; |
| |
| // For keys of SILFunction and SILLoop. |
| llvm::DenseMap<SILFunction *, ShortestPathAnalysis *> SPAs; |
| llvm::SpecificBumpPtrAllocator<ShortestPathAnalysis> SPAAllocator; |
| |
| ColdBlockInfo CBI; |
| |
| /// The following constants define the cost model for inlining. Some constants |
| /// are also defined in ShortestPathAnalysis. |
| enum { |
| /// The base value for every call: it represents the benefit of removing the |
| /// call overhead itself. |
| RemovedCallBenefit = 20, |
| |
| /// The benefit if the operand of an apply gets constant, e.g. if a closure |
| /// is passed to an apply instruction in the callee. |
| RemovedClosureBenefit = RemovedCallBenefit + 50, |
| |
| /// The benefit if a load can (probably) eliminated because it loads from |
| /// a stack location in the caller. |
| RemovedLoadBenefit = RemovedCallBenefit + 5, |
| |
| /// The benefit if a store can (probably) eliminated because it stores to |
| /// a stack location in the caller. |
| RemovedStoreBenefit = RemovedCallBenefit + 10, |
| |
| /// The benefit if the condition of a terminator instruction gets constant |
| /// due to inlining. |
| RemovedTerminatorBenefit = RemovedCallBenefit + 10, |
| |
| /// The benefit if a retain/release can (probably) be eliminated after |
| /// inlining. |
| RefCountBenefit = RemovedCallBenefit + 20, |
| |
| /// The benefit of a onFastPath builtin. |
| FastPathBuiltinBenefit = RemovedCallBenefit + 40, |
| |
| /// Approximately up to this cost level a function can be inlined without |
| /// increasing the code size. |
| TrivialFunctionThreshold = 18, |
| |
| /// Configuration for the caller block limit. |
| BlockLimitDenominator = 10000, |
| |
| /// The assumed execution length of a function call. |
| DefaultApplyLength = 10 |
| }; |
| |
| #ifndef NDEBUG |
| SILFunction *LastPrintedCaller = nullptr; |
| void dumpCaller(SILFunction *Caller) { |
| if (Caller != LastPrintedCaller) { |
| llvm::dbgs() << "\nInline into caller: " << Caller->getName() << '\n'; |
| LastPrintedCaller = Caller; |
| } |
| } |
| #endif |
| |
| ShortestPathAnalysis *getSPA(SILFunction *F, SILLoopInfo *LI) { |
| ShortestPathAnalysis *&SPA = SPAs[F]; |
| if (!SPA) { |
| SPA = new (SPAAllocator.Allocate()) ShortestPathAnalysis(F, LI); |
| } |
| return SPA; |
| } |
| |
| SILFunction *getEligibleFunction(FullApplySite AI); |
| |
| bool isProfitableToInline(FullApplySite AI, |
| Weight CallerWeight, |
| ConstantTracker &constTracker, |
| int &NumCallerBlocks); |
| |
| bool isProfitableInColdBlock(FullApplySite AI, SILFunction *Callee); |
| |
| void visitColdBlocks(SmallVectorImpl<FullApplySite> &AppliesToInline, |
| SILBasicBlock *root, DominanceInfo *DT); |
| |
| void collectAppliesToInline(SILFunction *Caller, |
| SmallVectorImpl<FullApplySite> &Applies); |
| |
| public: |
| SILPerformanceInliner(InlineSelection WhatToInline, DominanceAnalysis *DA, |
| SILLoopAnalysis *LA) |
| : WhatToInline(WhatToInline), DA(DA), LA(LA), CBI(DA) {} |
| |
| bool inlineCallsIntoFunction(SILFunction *F); |
| }; |
| |
| } // namespace |
| |
| // Return true if the callee has self-recursive calls. |
| static bool calleeIsSelfRecursive(SILFunction *Callee) { |
| for (auto &BB : *Callee) |
| for (auto &I : BB) |
| if (auto Apply = FullApplySite::isa(&I)) |
| if (Apply.getReferencedFunction() == Callee) |
| return true; |
| return false; |
| } |
| |
| // Returns the callee of an apply_inst if it is basically inlineable. |
| SILFunction *SILPerformanceInliner::getEligibleFunction(FullApplySite AI) { |
| |
| SILFunction *Callee = AI.getReferencedFunction(); |
| |
| if (!Callee) { |
| return nullptr; |
| } |
| |
| // Don't inline functions that are marked with the @_semantics or @effects |
| // attribute if the inliner is asked not to inline them. |
| if (Callee->hasSemanticsAttrs() || Callee->hasEffectsKind()) { |
| if (WhatToInline == InlineSelection::NoSemanticsAndGlobalInit) { |
| return nullptr; |
| } |
| // The "availability" semantics attribute is treated like global-init. |
| if (Callee->hasSemanticsAttrs() && |
| WhatToInline != InlineSelection::Everything && |
| Callee->hasSemanticsAttrThatStartsWith("availability")) { |
| return nullptr; |
| } |
| } else if (Callee->isGlobalInit()) { |
| if (WhatToInline != InlineSelection::Everything) { |
| return nullptr; |
| } |
| } |
| |
| // We can't inline external declarations. |
| if (Callee->empty() || Callee->isExternalDeclaration()) { |
| return nullptr; |
| } |
| |
| // Explicitly disabled inlining. |
| if (Callee->getInlineStrategy() == NoInline) { |
| return nullptr; |
| } |
| |
| if (!Callee->shouldOptimize()) { |
| return nullptr; |
| } |
| |
| // We don't support this yet. |
| if (AI.hasSubstitutions()) { |
| return nullptr; |
| } |
| |
| SILFunction *Caller = AI.getFunction(); |
| |
| // We don't support inlining a function that binds dynamic self because we |
| // have no mechanism to preserve the original function's local self metadata. |
| if (computeMayBindDynamicSelf(Callee)) { |
| // Check if passed Self is the same as the Self of the caller. |
| // In this case, it is safe to inline because both functions |
| // use the same Self. |
| if (AI.hasSelfArgument() && Caller->hasSelfParam()) { |
| auto CalleeSelf = stripCasts(AI.getSelfArgument()); |
| auto CallerSelf = Caller->getSelfArgument(); |
| if (CalleeSelf != SILValue(CallerSelf)) |
| return nullptr; |
| } else |
| return nullptr; |
| } |
| |
| // Detect self-recursive calls. |
| if (Caller == Callee) { |
| return nullptr; |
| } |
| |
| // A non-fragile function may not be inlined into a fragile function. |
| if (Caller->isFragile() && |
| !Callee->hasValidLinkageForFragileInline()) { |
| if (!Callee->hasValidLinkageForFragileRef()) { |
| llvm::errs() << "caller: " << Caller->getName() << "\n"; |
| llvm::errs() << "callee: " << Callee->getName() << "\n"; |
| llvm_unreachable("Should never be inlining a resilient function into " |
| "a fragile function"); |
| } |
| return nullptr; |
| } |
| |
| // Inlining self-recursive functions into other functions can result |
| // in excessive code duplication since we run the inliner multiple |
| // times in our pipeline |
| if (calleeIsSelfRecursive(Callee)) { |
| return nullptr; |
| } |
| |
| return Callee; |
| } |
| |
| /// Return true if inlining this call site is profitable. |
| bool SILPerformanceInliner::isProfitableToInline(FullApplySite AI, |
| Weight CallerWeight, |
| ConstantTracker &callerTracker, |
| int &NumCallerBlocks) { |
| SILFunction *Callee = AI.getReferencedFunction(); |
| |
| if (Callee->getInlineStrategy() == AlwaysInline) |
| return true; |
| |
| SILLoopInfo *LI = LA->get(Callee); |
| ShortestPathAnalysis *SPA = getSPA(Callee, LI); |
| assert(SPA->isValid()); |
| |
| ConstantTracker constTracker(Callee, &callerTracker, AI); |
| DominanceInfo *DT = DA->get(Callee); |
| SILBasicBlock *CalleeEntry = &Callee->front(); |
| DominanceOrder domOrder(CalleeEntry, DT, Callee->size()); |
| |
| // Calculate the inlining cost of the callee. |
| int CalleeCost = 0; |
| int Benefit = 0; |
| |
| // Start with a base benefit. |
| int BaseBenefit = RemovedCallBenefit; |
| const SILOptions &Opts = Callee->getModule().getOptions(); |
| |
| // For some reason -Ounchecked can accept a higher base benefit without |
| // increasing the code size too much. |
| if (Opts.Optimization == SILOptions::SILOptMode::OptimizeUnchecked) |
| BaseBenefit *= 2; |
| |
| CallerWeight.updateBenefit(Benefit, BaseBenefit); |
| |
| // Go through all blocks of the function, accumulate the cost and find |
| // benefits. |
| while (SILBasicBlock *block = domOrder.getNext()) { |
| constTracker.beginBlock(); |
| Weight BlockW = SPA->getWeight(block, CallerWeight); |
| |
| for (SILInstruction &I : *block) { |
| constTracker.trackInst(&I); |
| |
| CalleeCost += (int)instructionInlineCost(I); |
| |
| if (FullApplySite AI = FullApplySite::isa(&I)) { |
| |
| // Check if the callee is passed as an argument. If so, increase the |
| // threshold, because inlining will (probably) eliminate the closure. |
| SILInstruction *def = constTracker.getDefInCaller(AI.getCallee()); |
| if (def && (isa<FunctionRefInst>(def) || isa<PartialApplyInst>(def))) |
| BlockW.updateBenefit(Benefit, RemovedClosureBenefit); |
| } else if (auto *LI = dyn_cast<LoadInst>(&I)) { |
| // Check if it's a load from a stack location in the caller. Such a load |
| // might be optimized away if inlined. |
| if (constTracker.isStackAddrInCaller(LI->getOperand())) |
| BlockW.updateBenefit(Benefit, RemovedLoadBenefit); |
| } else if (auto *SI = dyn_cast<StoreInst>(&I)) { |
| // Check if it's a store to a stack location in the caller. Such a load |
| // might be optimized away if inlined. |
| if (constTracker.isStackAddrInCaller(SI->getDest())) |
| BlockW.updateBenefit(Benefit, RemovedStoreBenefit); |
| } else if (isa<StrongReleaseInst>(&I) || isa<ReleaseValueInst>(&I)) { |
| SILValue Op = stripCasts(I.getOperand(0)); |
| if (SILArgument *Arg = dyn_cast<SILArgument>(Op)) { |
| if (Arg->isFunctionArg() && Arg->getArgumentConvention() == |
| SILArgumentConvention::Direct_Guaranteed) { |
| BlockW.updateBenefit(Benefit, RefCountBenefit); |
| } |
| } |
| } else if (auto *BI = dyn_cast<BuiltinInst>(&I)) { |
| if (BI->getBuiltinInfo().ID == BuiltinValueKind::OnFastPath) |
| BlockW.updateBenefit(Benefit, FastPathBuiltinBenefit); |
| } |
| } |
| // Don't count costs in blocks which are dead after inlining. |
| SILBasicBlock *takenBlock = constTracker.getTakenBlock(block->getTerminator()); |
| if (takenBlock) { |
| BlockW.updateBenefit(Benefit, RemovedTerminatorBenefit); |
| domOrder.pushChildrenIf(block, [=] (SILBasicBlock *child) { |
| return child->getSinglePredecessor() != block || child == takenBlock; |
| }); |
| } else { |
| domOrder.pushChildren(block); |
| } |
| } |
| |
| if (AI.getFunction()->isThunk()) { |
| // Only inline trivial functions into thunks (which will not increase the |
| // code size). |
| if (CalleeCost > TrivialFunctionThreshold) |
| return false; |
| |
| DEBUG( |
| |
| dumpCaller(AI.getFunction()); |
| llvm::dbgs() << " decision {" << CalleeCost << " into thunk} " << |
| Callee->getName() << '\n'; |
| ); |
| return true; |
| } |
| |
| // We reduce the benefit if the caller is too large. For this we use a |
| // cubic function on the number of caller blocks. This starts to prevent |
| // inlining at about 800 - 1000 caller blocks. |
| int blockMinus = |
| (NumCallerBlocks * NumCallerBlocks) / BlockLimitDenominator * |
| NumCallerBlocks / BlockLimitDenominator; |
| Benefit -= blockMinus; |
| |
| // This is the final inlining decision. |
| if (CalleeCost > Benefit) { |
| return false; |
| } |
| |
| NumCallerBlocks += Callee->size(); |
| |
| DEBUG( |
| dumpCaller(AI.getFunction()); |
| llvm::dbgs() << " decision {c=" << CalleeCost << ", b=" << Benefit << |
| ", l=" << SPA->getScopeLength(CalleeEntry, 0) << |
| ", c-w=" << CallerWeight << ", bb=" << Callee->size() << |
| ", c-bb=" << NumCallerBlocks << "} " << Callee->getName() << '\n'; |
| ); |
| return true; |
| } |
| |
| /// Return true if inlining this call site into a cold block is profitable. |
| bool SILPerformanceInliner::isProfitableInColdBlock(FullApplySite AI, |
| SILFunction *Callee) { |
| if (Callee->getInlineStrategy() == AlwaysInline) |
| return true; |
| |
| int CalleeCost = 0; |
| |
| for (SILBasicBlock &Block : *Callee) { |
| for (SILInstruction &I : Block) { |
| CalleeCost += int(instructionInlineCost(I)); |
| if (CalleeCost > TrivialFunctionThreshold) |
| return false; |
| } |
| } |
| DEBUG( |
| dumpCaller(AI.getFunction()); |
| llvm::dbgs() << " cold decision {" << CalleeCost << "} " << |
| Callee->getName() << '\n'; |
| ); |
| return true; |
| } |
| |
| /// Record additional weight increases. |
| /// |
| /// Why can't we just add the weight when we call isProfitableToInline? Because |
| /// the additional weight is for _another_ function than the current handled |
| /// callee. |
| static void addWeightCorrection(FullApplySite FAS, |
| llvm::DenseMap<FullApplySite, int> &WeightCorrections) { |
| SILFunction *Callee = FAS.getReferencedFunction(); |
| if (Callee && Callee->hasSemanticsAttr("array.uninitialized")) { |
| // We want to inline the argument to an array.uninitialized call, because |
| // this argument is most likely a call to a function which contains the |
| // buffer allocation for the array. It is essential to inline it for stack |
| // promotion of the array buffer. |
| SILValue BufferArg = FAS.getArgument(0); |
| SILValue Base = stripValueProjections(stripCasts(BufferArg)); |
| if (auto BaseApply = FullApplySite::isa(Base)) |
| WeightCorrections[BaseApply] += 6; |
| } |
| } |
| |
| void SILPerformanceInliner::collectAppliesToInline( |
| SILFunction *Caller, SmallVectorImpl<FullApplySite> &Applies) { |
| DominanceInfo *DT = DA->get(Caller); |
| SILLoopInfo *LI = LA->get(Caller); |
| |
| llvm::DenseMap<FullApplySite, int> WeightCorrections; |
| |
| // Compute the shortest-path analysis for the caller. |
| ShortestPathAnalysis *SPA = getSPA(Caller, LI); |
| SPA->analyze(CBI, [&](FullApplySite FAS) -> int { |
| |
| // This closure returns the length of a called function. |
| |
| // At this occasion we record additional weight increases. |
| addWeightCorrection(FAS, WeightCorrections); |
| |
| if (SILFunction *Callee = getEligibleFunction(FAS)) { |
| // Compute the shortest-path analysis for the callee. |
| SILLoopInfo *CalleeLI = LA->get(Callee); |
| ShortestPathAnalysis *CalleeSPA = getSPA(Callee, CalleeLI); |
| if (!CalleeSPA->isValid()) { |
| CalleeSPA->analyze(CBI, [](FullApplySite FAS) { |
| // We don't compute SPA for another call-level. Functions called from |
| // the callee are assumed to have DefaultApplyLength. |
| return DefaultApplyLength; |
| }); |
| } |
| int CalleeLength = CalleeSPA->getScopeLength(&Callee->front(), 0); |
| // Just in case the callee is a noreturn function. |
| if (CalleeLength >= ShortestPathAnalysis::InitialDist) |
| return DefaultApplyLength; |
| return CalleeLength; |
| } |
| // Some unknown function. |
| return DefaultApplyLength; |
| }); |
| |
| #ifndef NDEBUG |
| if (PrintShortestPathInfo) { |
| SPA->dump(); |
| } |
| #endif |
| |
| ConstantTracker constTracker(Caller); |
| DominanceOrder domOrder(&Caller->front(), DT, Caller->size()); |
| int NumCallerBlocks = (int)Caller->size(); |
| |
| // Go through all instructions and find candidates for inlining. |
| // We do this in dominance order for the constTracker. |
| SmallVector<FullApplySite, 8> InitialCandidates; |
| while (SILBasicBlock *block = domOrder.getNext()) { |
| constTracker.beginBlock(); |
| Weight BlockWeight; |
| |
| for (auto I = block->begin(), E = block->end(); I != E; ++I) { |
| constTracker.trackInst(&*I); |
| |
| if (!FullApplySite::isa(&*I)) |
| continue; |
| |
| FullApplySite AI = FullApplySite(&*I); |
| |
| auto *Callee = getEligibleFunction(AI); |
| if (Callee) { |
| if (!BlockWeight.isValid()) |
| BlockWeight = SPA->getWeight(block, Weight(0, 0)); |
| |
| // The actual weight including a possible weight correction. |
| Weight W(BlockWeight, WeightCorrections.lookup(AI)); |
| |
| if (isProfitableToInline(AI, W, constTracker, NumCallerBlocks)) |
| InitialCandidates.push_back(AI); |
| } |
| } |
| domOrder.pushChildrenIf(block, [&] (SILBasicBlock *child) { |
| if (CBI.isSlowPath(block, child)) { |
| // Handle cold blocks separately. |
| visitColdBlocks(InitialCandidates, child, DT); |
| return false; |
| } |
| return true; |
| }); |
| } |
| |
| // Calculate how many times a callee is called from this caller. |
| llvm::DenseMap<SILFunction *, unsigned> CalleeCount; |
| for (auto AI : InitialCandidates) { |
| SILFunction *Callee = AI.getReferencedFunction(); |
| assert(Callee && "apply_inst does not have a direct callee anymore"); |
| CalleeCount[Callee]++; |
| } |
| |
| // Now copy each candidate callee that has a small enough number of |
| // call sites into the final set of call sites. |
| for (auto AI : InitialCandidates) { |
| SILFunction *Callee = AI.getReferencedFunction(); |
| assert(Callee && "apply_inst does not have a direct callee anymore"); |
| |
| const unsigned CallsToCalleeThreshold = 1024; |
| if (CalleeCount[Callee] <= CallsToCalleeThreshold) |
| Applies.push_back(AI); |
| } |
| } |
| |
| /// \brief Attempt to inline all calls smaller than our threshold. |
| /// returns True if a function was inlined. |
| bool SILPerformanceInliner::inlineCallsIntoFunction(SILFunction *Caller) { |
| // Don't optimize functions that are marked with the opt.never attribute. |
| if (!Caller->shouldOptimize()) |
| return false; |
| |
| // First step: collect all the functions we want to inline. We |
| // don't change anything yet so that the dominator information |
| // remains valid. |
| SmallVector<FullApplySite, 8> AppliesToInline; |
| collectAppliesToInline(Caller, AppliesToInline); |
| |
| if (AppliesToInline.empty()) |
| return false; |
| |
| // Second step: do the actual inlining. |
| for (auto AI : AppliesToInline) { |
| SILFunction *Callee = AI.getReferencedFunction(); |
| assert(Callee && "apply_inst does not have a direct callee anymore"); |
| |
| if (!Callee->shouldOptimize()) { |
| continue; |
| } |
| |
| SmallVector<SILValue, 8> Args; |
| for (const auto &Arg : AI.getArguments()) |
| Args.push_back(Arg); |
| |
| DEBUG( |
| dumpCaller(Caller); |
| llvm::dbgs() << " inline [" << Callee->size() << "->" << |
| Caller->size() << "] " << Callee->getName() << "\n"; |
| ); |
| |
| SILOpenedArchetypesTracker OpenedArchetypesTracker(*Caller); |
| Caller->getModule().registerDeleteNotificationHandler(&OpenedArchetypesTracker); |
| // The callee only needs to know about opened archetypes used in |
| // the substitution list. |
| OpenedArchetypesTracker.registerUsedOpenedArchetypes(AI.getInstruction()); |
| |
| // Notice that we will skip all of the newly inlined ApplyInsts. That's |
| // okay because we will visit them in our next invocation of the inliner. |
| TypeSubstitutionMap ContextSubs; |
| SILInliner Inliner(*Caller, *Callee, |
| SILInliner::InlineKind::PerformanceInline, ContextSubs, |
| AI.getSubstitutions(), |
| OpenedArchetypesTracker); |
| |
| auto Success = Inliner.inlineFunction(AI, Args); |
| (void) Success; |
| // We've already determined we should be able to inline this, so |
| // we expect it to have happened. |
| assert(Success && "Expected inliner to inline this function!"); |
| |
| recursivelyDeleteTriviallyDeadInstructions(AI.getInstruction(), true); |
| |
| NumFunctionsInlined++; |
| } |
| |
| return true; |
| } |
| |
| // Find functions in cold blocks which are forced to be inlined. |
| // All other functions are not inlined in cold blocks. |
| void SILPerformanceInliner::visitColdBlocks( |
| SmallVectorImpl<FullApplySite> &AppliesToInline, SILBasicBlock *Root, |
| DominanceInfo *DT) { |
| DominanceOrder domOrder(Root, DT); |
| while (SILBasicBlock *block = domOrder.getNext()) { |
| for (SILInstruction &I : *block) { |
| ApplyInst *AI = dyn_cast<ApplyInst>(&I); |
| if (!AI) |
| continue; |
| |
| auto *Callee = getEligibleFunction(AI); |
| if (Callee && isProfitableInColdBlock(AI, Callee)) { |
| AppliesToInline.push_back(AI); |
| } |
| } |
| domOrder.pushChildren(block); |
| } |
| } |
| |
| |
| //===----------------------------------------------------------------------===// |
| // Performance Inliner Pass |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
| class SILPerformanceInlinerPass : public SILFunctionTransform { |
| /// Specifies which functions not to inline, based on @_semantics and |
| /// global_init attributes. |
| InlineSelection WhatToInline; |
| std::string PassName; |
| |
| public: |
| SILPerformanceInlinerPass(InlineSelection WhatToInline, StringRef LevelName): |
| WhatToInline(WhatToInline), PassName(LevelName) { |
| PassName.append(" Performance Inliner"); |
| } |
| |
| void run() override { |
| DominanceAnalysis *DA = PM->getAnalysis<DominanceAnalysis>(); |
| SILLoopAnalysis *LA = PM->getAnalysis<SILLoopAnalysis>(); |
| |
| if (getOptions().InlineThreshold == 0) { |
| return; |
| } |
| |
| SILPerformanceInliner Inliner(WhatToInline, DA, LA); |
| |
| assert(getFunction()->isDefinition() && |
| "Expected only functions with bodies!"); |
| |
| // Inline things into this function, and if we do so invalidate |
| // analyses for this function and restart the pipeline so that we |
| // can further optimize this function before attempting to inline |
| // in it again. |
| if (Inliner.inlineCallsIntoFunction(getFunction())) { |
| invalidateAnalysis(SILAnalysis::InvalidationKind::FunctionBody); |
| restartPassPipeline(); |
| } |
| } |
| |
| StringRef getName() override { return PassName; } |
| }; |
| } // end anonymous namespace |
| |
| /// Create an inliner pass that does not inline functions that are marked with |
| /// the @_semantics, @effects or global_init attributes. |
| SILTransform *swift::createEarlyInliner() { |
| return new SILPerformanceInlinerPass( |
| InlineSelection::NoSemanticsAndGlobalInit, "Early"); |
| } |
| |
| /// Create an inliner pass that does not inline functions that are marked with |
| /// the global_init attribute or have an "availability" semantics attribute. |
| SILTransform *swift::createPerfInliner() { |
| return new SILPerformanceInlinerPass(InlineSelection::NoGlobalInit, "Middle"); |
| } |
| |
| /// Create an inliner pass that inlines all functions that are marked with |
| /// the @_semantics, @effects or global_init attributes. |
| SILTransform *swift::createLateInliner() { |
| return new SILPerformanceInlinerPass(InlineSelection::Everything, "Late"); |
| } |