| //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This pass implements the Bottom Up SLP vectorizer. It detects consecutive |
| // stores that can be put together into vector-stores. Next, it attempts to |
| // construct a vectorizable tree using the use-def chains. If a profitable tree |
| // was found, the SLP vectorizer performs vectorization on the tree. |
| // |
| // The pass is inspired by the work described in the paper: |
| // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Transforms/Vectorize/SLPVectorizer.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/DenseSet.h" |
| #include "llvm/ADT/Optional.h" |
| #include "llvm/ADT/PostOrderIterator.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/ADT/SmallBitVector.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| #include "llvm/ADT/SmallSet.h" |
| #include "llvm/ADT/SmallString.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/ADT/iterator.h" |
| #include "llvm/ADT/iterator_range.h" |
| #include "llvm/Analysis/AliasAnalysis.h" |
| #include "llvm/Analysis/CodeMetrics.h" |
| #include "llvm/Analysis/DemandedBits.h" |
| #include "llvm/Analysis/GlobalsModRef.h" |
| #include "llvm/Analysis/LoopAccessAnalysis.h" |
| #include "llvm/Analysis/LoopInfo.h" |
| #include "llvm/Analysis/MemoryLocation.h" |
| #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| #include "llvm/Analysis/ScalarEvolution.h" |
| #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| #include "llvm/Analysis/TargetLibraryInfo.h" |
| #include "llvm/Analysis/TargetTransformInfo.h" |
| #include "llvm/Analysis/ValueTracking.h" |
| #include "llvm/Analysis/VectorUtils.h" |
| #include "llvm/Analysis/AssumptionCache.h" |
| #include "llvm/IR/Attributes.h" |
| #include "llvm/IR/BasicBlock.h" |
| #include "llvm/IR/Constant.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/DebugLoc.h" |
| #include "llvm/IR/DerivedTypes.h" |
| #include "llvm/IR/Dominators.h" |
| #include "llvm/IR/Function.h" |
| #include "llvm/IR/IRBuilder.h" |
| #include "llvm/IR/InstrTypes.h" |
| #include "llvm/IR/Instruction.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/Intrinsics.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/NoFolder.h" |
| #include "llvm/IR/Operator.h" |
| #include "llvm/IR/PatternMatch.h" |
| #include "llvm/IR/Type.h" |
| #include "llvm/IR/Use.h" |
| #include "llvm/IR/User.h" |
| #include "llvm/IR/Value.h" |
| #include "llvm/IR/ValueHandle.h" |
| #include "llvm/IR/Verifier.h" |
| #include "llvm/InitializePasses.h" |
| #include "llvm/Pass.h" |
| #include "llvm/Support/Casting.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Compiler.h" |
| #include "llvm/Support/DOTGraphTraits.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/GraphWriter.h" |
| #include "llvm/Support/KnownBits.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Transforms/Utils/InjectTLIMappings.h" |
| #include "llvm/Transforms/Utils/LoopUtils.h" |
| #include "llvm/Transforms/Vectorize.h" |
| #include <algorithm> |
| #include <cassert> |
| #include <cstdint> |
| #include <iterator> |
| #include <memory> |
| #include <set> |
| #include <string> |
| #include <tuple> |
| #include <utility> |
| #include <vector> |
| |
| using namespace llvm; |
| using namespace llvm::PatternMatch; |
| using namespace slpvectorizer; |
| |
| #define SV_NAME "slp-vectorizer" |
| #define DEBUG_TYPE "SLP" |
| |
| STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); |
| |
| cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, |
| cl::desc("Run the SLP vectorization passes")); |
| |
| static cl::opt<int> |
| SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, |
| cl::desc("Only vectorize if you gain more than this " |
| "number ")); |
| |
| static cl::opt<bool> |
| ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, |
| cl::desc("Attempt to vectorize horizontal reductions")); |
| |
| static cl::opt<bool> ShouldStartVectorizeHorAtStore( |
| "slp-vectorize-hor-store", cl::init(false), cl::Hidden, |
| cl::desc( |
| "Attempt to vectorize horizontal reductions feeding into a store")); |
| |
| static cl::opt<int> |
| MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, |
| cl::desc("Attempt to vectorize for this register size in bits")); |
| |
| static cl::opt<int> |
| MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, |
| cl::desc("Maximum depth of the lookup for consecutive stores.")); |
| |
| /// Limits the size of scheduling regions in a block. |
| /// It avoids long compile times for _very_ large blocks where vector |
| /// instructions are spread over a wide range. |
| /// This limit is way higher than needed by real-world functions. |
| static cl::opt<int> |
| ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, |
| cl::desc("Limit the size of the SLP scheduling region per block")); |
| |
| static cl::opt<int> MinVectorRegSizeOption( |
| "slp-min-reg-size", cl::init(128), cl::Hidden, |
| cl::desc("Attempt to vectorize for this register size in bits")); |
| |
| static cl::opt<unsigned> RecursionMaxDepth( |
| "slp-recursion-max-depth", cl::init(12), cl::Hidden, |
| cl::desc("Limit the recursion depth when building a vectorizable tree")); |
| |
| static cl::opt<unsigned> MinTreeSize( |
| "slp-min-tree-size", cl::init(3), cl::Hidden, |
| cl::desc("Only vectorize small trees if they are fully vectorizable")); |
| |
| // The maximum depth that the look-ahead score heuristic will explore. |
| // The higher this value, the higher the compilation time overhead. |
| static cl::opt<int> LookAheadMaxDepth( |
| "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, |
| cl::desc("The maximum look-ahead depth for operand reordering scores")); |
| |
| // The Look-ahead heuristic goes through the users of the bundle to calculate |
| // the users cost in getExternalUsesCost(). To avoid a compile-time increase, |
| // we limit the number of users visited to this value. |
| static cl::opt<unsigned> LookAheadUsersBudget( |
| "slp-look-ahead-users-budget", cl::init(2), cl::Hidden, |
| cl::desc("The maximum number of users to visit while visiting the " |
| "predecessors. This prevents compilation time increase.")); |
| |
| static cl::opt<bool> |
| ViewSLPTree("view-slp-tree", cl::Hidden, |
| cl::desc("Display the SLP trees with Graphviz")); |
| |
| // Limit the number of alias checks. The limit is chosen so that |
| // it has no negative effect on the llvm benchmarks. |
| static const unsigned AliasedCheckLimit = 10; |
| |
| // Another limit for the alias checks: The maximum distance between load/store |
| // instructions where alias checks are done. |
| // This limit is useful for very large basic blocks. |
| static const unsigned MaxMemDepDistance = 160; |
| |
| /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling |
| /// regions to be handled. |
| static const int MinScheduleRegionSize = 16; |
| |
| /// Predicate for the element types that the SLP vectorizer supports. |
| /// |
| /// The most important thing to filter here are types which are invalid in LLVM |
| /// vectors. We also filter target specific types which have absolutely no |
| /// meaningful vectorization path such as x86_fp80 and ppc_f128. This just |
| /// avoids spending time checking the cost model and realizing that they will |
| /// be inevitably scalarized. |
| static bool isValidElementType(Type *Ty) { |
| return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && |
| !Ty->isPPC_FP128Ty(); |
| } |
| |
| /// \returns true if all of the instructions in \p VL are in the same block or |
| /// false otherwise. |
| static bool allSameBlock(ArrayRef<Value *> VL) { |
| Instruction *I0 = dyn_cast<Instruction>(VL[0]); |
| if (!I0) |
| return false; |
| BasicBlock *BB = I0->getParent(); |
| for (int i = 1, e = VL.size(); i < e; i++) { |
| Instruction *I = dyn_cast<Instruction>(VL[i]); |
| if (!I) |
| return false; |
| |
| if (BB != I->getParent()) |
| return false; |
| } |
| return true; |
| } |
| |
| /// \returns True if all of the values in \p VL are constants (but not |
| /// globals/constant expressions). |
| static bool allConstant(ArrayRef<Value *> VL) { |
| // Constant expressions and globals can't be vectorized like normal integer/FP |
| // constants. |
| for (Value *i : VL) |
| if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i)) |
| return false; |
| return true; |
| } |
| |
| /// \returns True if all of the values in \p VL are identical. |
| static bool isSplat(ArrayRef<Value *> VL) { |
| for (unsigned i = 1, e = VL.size(); i < e; ++i) |
| if (VL[i] != VL[0]) |
| return false; |
| return true; |
| } |
| |
| /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. |
| static bool isCommutative(Instruction *I) { |
| if (auto *Cmp = dyn_cast<CmpInst>(I)) |
| return Cmp->isCommutative(); |
| if (auto *BO = dyn_cast<BinaryOperator>(I)) |
| return BO->isCommutative(); |
| // TODO: This should check for generic Instruction::isCommutative(), but |
| // we need to confirm that the caller code correctly handles Intrinsics |
| // (which, for example, may not have exactly 2 operands). |
| return false; |
| } |
| |
| /// Checks if the vector of instructions can be represented as a shuffle, like: |
| /// %x0 = extractelement <4 x i8> %x, i32 0 |
| /// %x3 = extractelement <4 x i8> %x, i32 3 |
| /// %y1 = extractelement <4 x i8> %y, i32 1 |
| /// %y2 = extractelement <4 x i8> %y, i32 2 |
| /// %x0x0 = mul i8 %x0, %x0 |
| /// %x3x3 = mul i8 %x3, %x3 |
| /// %y1y1 = mul i8 %y1, %y1 |
| /// %y2y2 = mul i8 %y2, %y2 |
| /// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0 |
| /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 |
| /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 |
| /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 |
| /// ret <4 x i8> %ins4 |
| /// can be transformed into: |
| /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5, |
| /// i32 6> |
| /// %2 = mul <4 x i8> %1, %1 |
| /// ret <4 x i8> %2 |
| /// We convert this initially to something like: |
| /// %x0 = extractelement <4 x i8> %x, i32 0 |
| /// %x3 = extractelement <4 x i8> %x, i32 3 |
| /// %y1 = extractelement <4 x i8> %y, i32 1 |
| /// %y2 = extractelement <4 x i8> %y, i32 2 |
| /// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0 |
| /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 |
| /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 |
| /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 |
| /// %5 = mul <4 x i8> %4, %4 |
| /// %6 = extractelement <4 x i8> %5, i32 0 |
| /// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0 |
| /// %7 = extractelement <4 x i8> %5, i32 1 |
| /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 |
| /// %8 = extractelement <4 x i8> %5, i32 2 |
| /// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 |
| /// %9 = extractelement <4 x i8> %5, i32 3 |
| /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 |
| /// ret <4 x i8> %ins4 |
| /// InstCombiner transforms this into a shuffle and vector mul |
| /// TODO: Can we split off and reuse the shuffle mask detection from |
| /// TargetTransformInfo::getInstructionThroughput? |
| static Optional<TargetTransformInfo::ShuffleKind> |
| isShuffle(ArrayRef<Value *> VL) { |
| auto *EI0 = cast<ExtractElementInst>(VL[0]); |
| unsigned Size = |
| cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); |
| Value *Vec1 = nullptr; |
| Value *Vec2 = nullptr; |
| enum ShuffleMode { Unknown, Select, Permute }; |
| ShuffleMode CommonShuffleMode = Unknown; |
| for (unsigned I = 0, E = VL.size(); I < E; ++I) { |
| auto *EI = cast<ExtractElementInst>(VL[I]); |
| auto *Vec = EI->getVectorOperand(); |
| // All vector operands must have the same number of vector elements. |
| if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size) |
| return None; |
| auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand()); |
| if (!Idx) |
| return None; |
| // Undefined behavior if Idx is negative or >= Size. |
| if (Idx->getValue().uge(Size)) |
| continue; |
| unsigned IntIdx = Idx->getValue().getZExtValue(); |
| // We can extractelement from undef vector. |
| if (isa<UndefValue>(Vec)) |
| continue; |
| // For correct shuffling we have to have at most 2 different vector operands |
| // in all extractelement instructions. |
| if (!Vec1 || Vec1 == Vec) |
| Vec1 = Vec; |
| else if (!Vec2 || Vec2 == Vec) |
| Vec2 = Vec; |
| else |
| return None; |
| if (CommonShuffleMode == Permute) |
| continue; |
| // If the extract index is not the same as the operation number, it is a |
| // permutation. |
| if (IntIdx != I) { |
| CommonShuffleMode = Permute; |
| continue; |
| } |
| CommonShuffleMode = Select; |
| } |
| // If we're not crossing lanes in different vectors, consider it as blending. |
| if (CommonShuffleMode == Select && Vec2) |
| return TargetTransformInfo::SK_Select; |
| // If Vec2 was never used, we have a permutation of a single vector; otherwise |
| // we have a permutation of 2 vectors. |
| return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc |
| : TargetTransformInfo::SK_PermuteSingleSrc; |
| } |
| |
| namespace { |
| |
| /// Main data required for vectorization of instructions. |
| struct InstructionsState { |
| /// The very first instruction in the list with the main opcode. |
| Value *OpValue = nullptr; |
| |
| /// The main/alternate instruction. |
| Instruction *MainOp = nullptr; |
| Instruction *AltOp = nullptr; |
| |
| /// The main/alternate opcodes for the list of instructions. |
| unsigned getOpcode() const { |
| return MainOp ? MainOp->getOpcode() : 0; |
| } |
| |
| unsigned getAltOpcode() const { |
| return AltOp ? AltOp->getOpcode() : 0; |
| } |
| |
| /// Some of the instructions in the list have alternate opcodes. |
| bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } |
| |
| bool isOpcodeOrAlt(Instruction *I) const { |
| unsigned CheckedOpcode = I->getOpcode(); |
| return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; |
| } |
| |
| InstructionsState() = delete; |
| InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) |
| : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} |
| }; |
| |
| } // end anonymous namespace |
| |
| /// Chooses the correct key for scheduling data. If \p Op has the same (or |
| /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p |
| /// OpValue. |
| static Value *isOneOf(const InstructionsState &S, Value *Op) { |
| auto *I = dyn_cast<Instruction>(Op); |
| if (I && S.isOpcodeOrAlt(I)) |
| return Op; |
| return S.OpValue; |
| } |
| |
| /// \returns true if \p Opcode is allowed as part of the main/alternate |
| /// instruction for SLP vectorization. |
| /// |
| /// Example of unsupported opcode is SDIV that can potentially cause UB if the |
| /// "shuffled out" lane would result in division by zero. |
| static bool isValidForAlternation(unsigned Opcode) { |
| if (Instruction::isIntDivRem(Opcode)) |
| return false; |
| |
| return true; |
| } |
| |
| /// \returns an analysis of the instructions in \p VL, described by an |
| /// InstructionsState: the opcode with which we suppose the whole list could |
| /// be vectorized, even if its structure is diverse. |
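| /// |
| /// For example, for VL = {add, sub, add, sub} the returned state has MainOp |
| /// set to the first 'add' and AltOp set to the first 'sub', so isAltShuffle() |
| /// is true. For VL = {add, add, add}, MainOp == AltOp and isAltShuffle() is |
| /// false. If a third distinct binary opcode appears, or any element is not an |
| /// Instruction, MainOp and AltOp are returned as null. |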
| static InstructionsState getSameOpcode(ArrayRef<Value *> VL, |
| unsigned BaseIndex = 0) { |
| // Make sure these are all Instructions. |
| if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); })) |
| return InstructionsState(VL[BaseIndex], nullptr, nullptr); |
| |
| bool IsCastOp = isa<CastInst>(VL[BaseIndex]); |
| bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]); |
| unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode(); |
| unsigned AltOpcode = Opcode; |
| unsigned AltIndex = BaseIndex; |
| |
| // Check for one alternate opcode from another BinaryOperator. |
| // TODO - generalize to support all operators (types, calls etc.). |
| for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { |
| unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode(); |
| if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) { |
| if (InstOpcode == Opcode || InstOpcode == AltOpcode) |
| continue; |
| if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && |
| isValidForAlternation(Opcode)) { |
| AltOpcode = InstOpcode; |
| AltIndex = Cnt; |
| continue; |
| } |
| } else if (IsCastOp && isa<CastInst>(VL[Cnt])) { |
| Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType(); |
| Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType(); |
| if (Ty0 == Ty1) { |
| if (InstOpcode == Opcode || InstOpcode == AltOpcode) |
| continue; |
| if (Opcode == AltOpcode) { |
| assert(isValidForAlternation(Opcode) && |
| isValidForAlternation(InstOpcode) && |
| "Cast isn't safe for alternation, logic needs to be updated!"); |
| AltOpcode = InstOpcode; |
| AltIndex = Cnt; |
| continue; |
| } |
| } |
| } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) |
| continue; |
| return InstructionsState(VL[BaseIndex], nullptr, nullptr); |
| } |
| |
| return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]), |
| cast<Instruction>(VL[AltIndex])); |
| } |
| |
| /// \returns true if all of the values in \p VL have the same type or false |
| /// otherwise. |
| static bool allSameType(ArrayRef<Value *> VL) { |
| Type *Ty = VL[0]->getType(); |
| for (int i = 1, e = VL.size(); i < e; i++) |
| if (VL[i]->getType() != Ty) |
| return false; |
| |
| return true; |
| } |
| |
| /// \returns the index of the element that an Extract{Value,Element} |
| /// instruction extracts, or None if the index cannot be determined. |
| static Optional<unsigned> getExtractIndex(Instruction *E) { |
| unsigned Opcode = E->getOpcode(); |
| assert((Opcode == Instruction::ExtractElement || |
| Opcode == Instruction::ExtractValue) && |
| "Expected extractelement or extractvalue instruction."); |
| if (Opcode == Instruction::ExtractElement) { |
| auto *CI = dyn_cast<ConstantInt>(E->getOperand(1)); |
| if (!CI) |
| return None; |
| return CI->getZExtValue(); |
| } |
| ExtractValueInst *EI = cast<ExtractValueInst>(E); |
| if (EI->getNumIndices() != 1) |
| return None; |
| return *EI->idx_begin(); |
| } |
| |
| /// \returns True if an in-tree use also needs an extract. This refers to a |
| /// possible scalar operand in a vectorized instruction. |
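| /// For example, if a scalar in the tree is used as the pointer operand of a |
| /// scalar load/store user, or as a scalar argument of a vector intrinsic |
| /// call, that user still needs the scalar value, so an extract is required. |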
| static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, |
| TargetLibraryInfo *TLI) { |
| unsigned Opcode = UserInst->getOpcode(); |
| switch (Opcode) { |
| case Instruction::Load: { |
| LoadInst *LI = cast<LoadInst>(UserInst); |
| return (LI->getPointerOperand() == Scalar); |
| } |
| case Instruction::Store: { |
| StoreInst *SI = cast<StoreInst>(UserInst); |
| return (SI->getPointerOperand() == Scalar); |
| } |
| case Instruction::Call: { |
| CallInst *CI = cast<CallInst>(UserInst); |
| Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
| for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { |
| if (hasVectorInstrinsicScalarOpd(ID, i)) |
| return (CI->getArgOperand(i) == Scalar); |
| } |
| LLVM_FALLTHROUGH; |
| } |
| default: |
| return false; |
| } |
| } |
| |
| /// \returns the AA location that is being accessed by the instruction. |
| static MemoryLocation getLocation(Instruction *I, AAResults *AA) { |
| if (StoreInst *SI = dyn_cast<StoreInst>(I)) |
| return MemoryLocation::get(SI); |
| if (LoadInst *LI = dyn_cast<LoadInst>(I)) |
| return MemoryLocation::get(LI); |
| return MemoryLocation(); |
| } |
| |
| /// \returns True if the instruction is not a volatile or atomic load/store. |
| static bool isSimple(Instruction *I) { |
| if (LoadInst *LI = dyn_cast<LoadInst>(I)) |
| return LI->isSimple(); |
| if (StoreInst *SI = dyn_cast<StoreInst>(I)) |
| return SI->isSimple(); |
| if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) |
| return !MI->isVolatile(); |
| return true; |
| } |
| |
| namespace llvm { |
| |
| namespace slpvectorizer { |
| |
| /// Bottom Up SLP Vectorizer. |
| class BoUpSLP { |
| struct TreeEntry; |
| struct ScheduleData; |
| |
| public: |
| using ValueList = SmallVector<Value *, 8>; |
| using InstrList = SmallVector<Instruction *, 16>; |
| using ValueSet = SmallPtrSet<Value *, 16>; |
| using StoreList = SmallVector<StoreInst *, 8>; |
| using ExtraValueToDebugLocsMap = |
| MapVector<Value *, SmallVector<Instruction *, 2>>; |
| |
| BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, |
| TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, |
| DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, |
| const DataLayout *DL, OptimizationRemarkEmitter *ORE) |
| : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), |
| DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) { |
| CodeMetrics::collectEphemeralValues(F, AC, EphValues); |
| // Use the vector register size specified by the target unless overridden |
| // by a command-line option. |
| // TODO: It would be better to limit the vectorization factor based on |
| // data type rather than just register size. For example, x86 AVX has |
| // 256-bit registers, but it does not support integer operations |
| // at that width (that requires AVX2). |
| if (MaxVectorRegSizeOption.getNumOccurrences()) |
| MaxVecRegSize = MaxVectorRegSizeOption; |
| else |
| MaxVecRegSize = TTI->getRegisterBitWidth(true); |
| |
| if (MinVectorRegSizeOption.getNumOccurrences()) |
| MinVecRegSize = MinVectorRegSizeOption; |
| else |
| MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); |
| } |
| |
| /// Vectorize the tree that starts with the elements in \p VL. |
| /// Returns the vectorized root. |
| Value *vectorizeTree(); |
| |
| /// Vectorize the tree but with the list of externally used values \p |
| /// ExternallyUsedValues. Values in this MapVector can be replaced by the |
| /// generated extractvalue instructions. |
| Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues); |
| |
| /// \returns the cost incurred by unwanted spills and fills, caused by |
| /// holding live values over call sites. |
| int getSpillCost() const; |
| |
| /// \returns the vectorization cost of the vectorizable tree. A negative |
| /// number means that it is profitable to vectorize. |
| int getTreeCost(); |
| |
| /// Construct a vectorizable tree that starts at \p Roots, ignoring users for |
| /// the purpose of scheduling and extraction in the \p UserIgnoreLst. |
| void buildTree(ArrayRef<Value *> Roots, |
| ArrayRef<Value *> UserIgnoreLst = None); |
| |
| /// Construct a vectorizable tree that starts at \p Roots, ignoring users for |
| /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking |
| /// into account (and updating it, if required) list of externally used |
| /// values stored in \p ExternallyUsedValues. |
| void buildTree(ArrayRef<Value *> Roots, |
| ExtraValueToDebugLocsMap &ExternallyUsedValues, |
| ArrayRef<Value *> UserIgnoreLst = None); |
| |
| /// Clear the internal data structures that are created by 'buildTree'. |
| void deleteTree() { |
| VectorizableTree.clear(); |
| ScalarToTreeEntry.clear(); |
| MustGather.clear(); |
| ExternalUses.clear(); |
| NumOpsWantToKeepOrder.clear(); |
| NumOpsWantToKeepOriginalOrder = 0; |
| for (auto &Iter : BlocksSchedules) { |
| BlockScheduling *BS = Iter.second.get(); |
| BS->clear(); |
| } |
| MinBWs.clear(); |
| } |
| |
| unsigned getTreeSize() const { return VectorizableTree.size(); } |
| |
| /// Perform LICM and CSE on the newly generated gather sequences. |
| void optimizeGatherSequence(); |
| |
| /// \returns The best order of instructions for vectorization. |
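| /// For example, if three operations prefer the order {1, 0, 3, 2} and only |
| /// one prefers the original order, {1, 0, 3, 2} is returned; if no order is |
| /// more popular than the original one, None is returned. |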
| Optional<ArrayRef<unsigned>> bestOrder() const { |
| auto I = std::max_element( |
| NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), |
| [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, |
| const decltype(NumOpsWantToKeepOrder)::value_type &D2) { |
| return D1.second < D2.second; |
| }); |
| if (I == NumOpsWantToKeepOrder.end() || |
| I->getSecond() <= NumOpsWantToKeepOriginalOrder) |
| return None; |
| |
| return makeArrayRef(I->getFirst()); |
| } |
| |
| /// \return The vector element size in bits to use when vectorizing the |
| /// expression tree ending at \p V. If V is a store, the size is the width of |
| /// the stored value. Otherwise, the size is the width of the largest loaded |
| /// value reaching V. This method is used by the vectorizer to calculate |
| /// vectorization factors. |
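| /// For example, for a store of an i32 value the result is 32, while for an |
| /// i8 add whose operands are (directly or indirectly) fed by i8 loads the |
| /// result is 8. |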
| unsigned getVectorElementSize(Value *V); |
| |
| /// Compute the minimum type sizes required to represent the entries in a |
| /// vectorizable tree. |
| void computeMinimumValueSizes(); |
| |
| /// \returns the maximum vector register size as set by TTI or overridden by |
| /// the corresponding cl::opt. |
| unsigned getMaxVecRegSize() const { |
| return MaxVecRegSize; |
| } |
| |
| /// \returns the minimum vector register size as set by TTI or overridden by |
| /// the corresponding cl::opt. |
| unsigned getMinVecRegSize() const { |
| return MinVecRegSize; |
| } |
| |
| /// Check if a homogeneous aggregate is isomorphic to some VectorType. |
| /// Accepts homogeneous multidimensional aggregate of scalars/vectors like |
| /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, |
| /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. |
| /// |
| /// \returns number of elements in vector if isomorphism exists, 0 otherwise. |
| unsigned canMapToVector(Type *T, const DataLayout &DL) const; |
| |
| /// \returns True if the VectorizableTree is both tiny and not fully |
| /// vectorizable. We do not vectorize such trees. |
| bool isTreeTinyAndNotFullyVectorizable() const; |
| |
| /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values |
| /// can be load combined in the backend. Load combining may not be allowed in |
| /// the IR optimizer, so we do not want to alter the pattern. For example, |
| /// partially transforming a scalar bswap() pattern into vector code is |
| /// effectively impossible for the backend to undo. |
| /// TODO: If load combining is allowed in the IR optimizer, this analysis |
| /// may not be necessary. |
| bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; |
| |
| /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values |
| /// can be load combined in the backend. Load combining may not be allowed in |
| /// the IR optimizer, so we do not want to alter the pattern. For example, |
| /// partially transforming a scalar bswap() pattern into vector code is |
| /// effectively impossible for the backend to undo. |
| /// TODO: If load combining is allowed in the IR optimizer, this analysis |
| /// may not be necessary. |
| bool isLoadCombineCandidate() const; |
| |
| OptimizationRemarkEmitter *getORE() { return ORE; } |
| |
| /// This structure holds any data we need about the edges being traversed |
| /// during buildTree_rec(). We keep track of: |
| /// (i) the user TreeEntry index, and |
| /// (ii) the index of the edge. |
| struct EdgeInfo { |
| EdgeInfo() = default; |
| EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx) |
| : UserTE(UserTE), EdgeIdx(EdgeIdx) {} |
| /// The user TreeEntry. |
| TreeEntry *UserTE = nullptr; |
| /// The operand index of the use. |
| unsigned EdgeIdx = UINT_MAX; |
| #ifndef NDEBUG |
| friend inline raw_ostream &operator<<(raw_ostream &OS, |
| const BoUpSLP::EdgeInfo &EI) { |
| EI.dump(OS); |
| return OS; |
| } |
| /// Debug print. |
| void dump(raw_ostream &OS) const { |
| OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null") |
| << " EdgeIdx:" << EdgeIdx << "}"; |
| } |
| LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } |
| #endif |
| }; |
| |
| /// A helper data structure to hold the operands of a vector of instructions. |
| /// This supports a fixed vector length for all operand vectors. |
| class VLOperands { |
| /// For each operand we need (i) the value, and (ii) the opcode that it |
| /// would be attached to if the expression was in a left-linearized form. |
| /// This is required to avoid illegal operand reordering. |
| /// For example: |
| /// \verbatim |
| /// 0 Op1 |
| /// |/ |
| /// Op1 Op2 Linearized + Op2 |
| /// \ / ----------> |/ |
| /// - - |
| /// |
| /// Op1 - Op2 (0 + Op1) - Op2 |
| /// \endverbatim |
| /// |
| /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. |
| /// |
| /// Another way to think of this is to track all the operations across the |
| /// path from the operand all the way to the root of the tree and to |
| /// calculate the operation that corresponds to this path. For example, the |
| /// path from Op2 to the root crosses the RHS of the '-', therefore the |
| /// corresponding operation is a '-' (which matches the one in the |
| /// linearized tree, as shown above). |
| /// |
| /// For lack of a better term, we refer to this operation as Accumulated |
| /// Path Operation (APO). |
| struct OperandData { |
| OperandData() = default; |
| OperandData(Value *V, bool APO, bool IsUsed) |
| : V(V), APO(APO), IsUsed(IsUsed) {} |
| /// The operand value. |
| Value *V = nullptr; |
| /// TreeEntries only allow a single opcode, or an alternate sequence of |
| /// them (e.g, +, -). Therefore, we can safely use a boolean value for the |
| /// APO. It is set to 'true' if 'V' is attached to an inverse operation |
| /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise |
| /// (e.g., Add/Mul) |
| bool APO = false; |
| /// Helper data for the reordering function. |
| bool IsUsed = false; |
| }; |
| |
| /// During operand reordering, we are trying to select the operand at lane |
| /// that matches best with the operand at the neighboring lane. Our |
| /// selection is based on the type of value we are looking for. For example, |
| /// if the neighboring lane has a load, we need to look for a load that is |
| /// accessing a consecutive address. These strategies are summarized in the |
| /// 'ReorderingMode' enumerator. |
| enum class ReorderingMode { |
| Load, ///< Matching loads to consecutive memory addresses |
| Opcode, ///< Matching instructions based on opcode (same or alternate) |
| Constant, ///< Matching constants |
| Splat, ///< Matching the same instruction multiple times (broadcast) |
| Failed, ///< We failed to create a vectorizable group |
| }; |
| |
| using OperandDataVec = SmallVector<OperandData, 2>; |
| |
| /// A vector of operand vectors. |
| SmallVector<OperandDataVec, 4> OpsVec; |
| |
| const DataLayout &DL; |
| ScalarEvolution &SE; |
| const BoUpSLP &R; |
| |
| /// \returns the operand data at \p OpIdx and \p Lane. |
| OperandData &getData(unsigned OpIdx, unsigned Lane) { |
| return OpsVec[OpIdx][Lane]; |
| } |
| |
| /// \returns the operand data at \p OpIdx and \p Lane. Const version. |
| const OperandData &getData(unsigned OpIdx, unsigned Lane) const { |
| return OpsVec[OpIdx][Lane]; |
| } |
| |
| /// Clears the used flag for all entries. |
| void clearUsed() { |
| for (unsigned OpIdx = 0, NumOperands = getNumOperands(); |
| OpIdx != NumOperands; ++OpIdx) |
| for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; |
| ++Lane) |
| OpsVec[OpIdx][Lane].IsUsed = false; |
| } |
| |
| /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. |
| void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { |
| std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); |
| } |
| |
| // The hard-coded scores listed here are not very important. When computing |
| // the scores of matching one sub-tree with another, we are basically |
| // counting the number of values that are matching. So even if all scores |
| // are set to 1, we would still get a decent matching result. |
| // However, sometimes we have to break ties. For example we may have to |
| // choose between matching loads vs matching opcodes. This is what these |
| // scores are helping us with: they provide the order of preference. |
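| // For example, a pair of loads from consecutive addresses (score 3) is |
| // preferred over a pair of instructions that merely share an opcode |
| // (score 2), which in turn beats an add/sub alternate-opcode pair (score 1). |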
| |
| /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). |
| static const int ScoreConsecutiveLoads = 3; |
| /// ExtractElementInst from same vector and consecutive indexes. |
| static const int ScoreConsecutiveExtracts = 3; |
| /// Constants. |
| static const int ScoreConstants = 2; |
| /// Instructions with the same opcode. |
| static const int ScoreSameOpcode = 2; |
| /// Instructions with alt opcodes (e.g, add + sub). |
| static const int ScoreAltOpcodes = 1; |
| /// Identical instructions (a.k.a. splat or broadcast). |
| static const int ScoreSplat = 1; |
| /// Matching with an undef is preferable to failing. |
| static const int ScoreUndef = 1; |
| /// Score for failing to find a decent match. |
| static const int ScoreFail = 0; |
| /// A user external to the vectorized code. |
| static const int ExternalUseCost = 1; |
| /// The user is internal but in a different lane. |
| static const int UserInDiffLaneCost = ExternalUseCost; |
| |
| /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. |
| static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, |
| ScalarEvolution &SE) { |
| auto *LI1 = dyn_cast<LoadInst>(V1); |
| auto *LI2 = dyn_cast<LoadInst>(V2); |
| if (LI1 && LI2) |
| return isConsecutiveAccess(LI1, LI2, DL, SE) |
| ? VLOperands::ScoreConsecutiveLoads |
| : VLOperands::ScoreFail; |
| |
| auto *C1 = dyn_cast<Constant>(V1); |
| auto *C2 = dyn_cast<Constant>(V2); |
| if (C1 && C2) |
| return VLOperands::ScoreConstants; |
| |
| // Extracts from consecutive indexes of the same vector score better, as |
| // the extracts could be optimized away. |
| Value *EV; |
| ConstantInt *Ex1Idx, *Ex2Idx; |
| if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) && |
| match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) && |
| Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue()) |
| return VLOperands::ScoreConsecutiveExtracts; |
| |
| auto *I1 = dyn_cast<Instruction>(V1); |
| auto *I2 = dyn_cast<Instruction>(V2); |
| if (I1 && I2) { |
| if (I1 == I2) |
| return VLOperands::ScoreSplat; |
| InstructionsState S = getSameOpcode({I1, I2}); |
| // Note: Only consider instructions with <= 2 operands to avoid |
| // complexity explosion. |
| if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) |
| return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes |
| : VLOperands::ScoreSameOpcode; |
| } |
| |
| if (isa<UndefValue>(V2)) |
| return VLOperands::ScoreUndef; |
| |
| return VLOperands::ScoreFail; |
| } |
| |
| /// Holds the values and their lane that are taking part in the look-ahead |
| /// score calculation. This is used in the external uses cost calculation. |
| SmallDenseMap<Value *, int> InLookAheadValues; |
| |
| /// \Returns the additional cost due to uses of \p LHS and \p RHS that are |
| /// either external to the vectorized code, or require shuffling. |
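| /// For example, a user of \p LHS that is neither in the vectorizable tree |
| /// nor in the look-ahead code adds ExternalUseCost, while a user that is in |
| /// the tree but at a different lane adds UserInDiffLaneCost. |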
| int getExternalUsesCost(const std::pair<Value *, int> &LHS, |
| const std::pair<Value *, int> &RHS) { |
| int Cost = 0; |
| std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}}; |
| for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { |
| Value *V = Values[Idx].first; |
| // Calculate the absolute lane, using the minimum relative lane of LHS |
| // and RHS as base and Idx as the offset. |
| int Ln = std::min(LHS.second, RHS.second) + Idx; |
| assert(Ln >= 0 && "Bad lane calculation"); |
| unsigned UsersBudget = LookAheadUsersBudget; |
| for (User *U : V->users()) { |
| if (const TreeEntry *UserTE = R.getTreeEntry(U)) { |
| // The user is in the VectorizableTree. Check if we need to insert. |
| auto It = llvm::find(UserTE->Scalars, U); |
| assert(It != UserTE->Scalars.end() && "U is in UserTE"); |
| int UserLn = std::distance(UserTE->Scalars.begin(), It); |
| assert(UserLn >= 0 && "Bad lane"); |
| if (UserLn != Ln) |
| Cost += UserInDiffLaneCost; |
| } else { |
| // Check if the user is in the look-ahead code. |
| auto It2 = InLookAheadValues.find(U); |
| if (It2 != InLookAheadValues.end()) { |
| // The user is in the look-ahead code. Check the lane. |
| if (It2->second != Ln) |
| Cost += UserInDiffLaneCost; |
| } else { |
| // The user is neither in SLP tree nor in the look-ahead code. |
| Cost += ExternalUseCost; |
| } |
| } |
| // Limit the number of visited uses to cap compilation time. |
| if (--UsersBudget == 0) |
| break; |
| } |
| } |
| return Cost; |
| } |
| |
| /// Go through the operands of \p LHS and \p RHS recursively until \p |
| /// MaxLevel, and return the cumulative score. For example: |
| /// \verbatim |
| /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] |
| /// \ / \ / \ / \ / |
| /// + + + + |
| /// G1 G2 G3 G4 |
| /// \endverbatim |
| /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at |
| /// each level recursively, accumulating the score. It starts from matching |
| /// the additions at level 0, then moves on to the loads (level 1). The |
| /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and |
| /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while |
| /// {A[0],C[0]} has a score of VLOperands::ScoreFail. |
| /// Please note that the order of the operands does not matter, as we |
| /// evaluate the score of all profitable combinations of operands. In |
| /// other words the score of G1 and G4 is the same as G1 and G2. This |
| /// heuristic is based on ideas described in: |
| /// Look-ahead SLP: Auto-vectorization in the presence of commutative |
| /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, |
| /// Luís F. W. Góes |
| int getScoreAtLevelRec(const std::pair<Value *, int> &LHS, |
| const std::pair<Value *, int> &RHS, int CurrLevel, |
| int MaxLevel) { |
| |
| Value *V1 = LHS.first; |
| Value *V2 = RHS.first; |
| // Get the shallow score of V1 and V2. |
| int ShallowScoreAtThisLevel = |
| std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - |
| getExternalUsesCost(LHS, RHS)); |
| int Lane1 = LHS.second; |
| int Lane2 = RHS.second; |
| |
| // Return the current cost early if we have reached MaxLevel, if V1 and V2 |
| // are not instructions, if they are the same instruction (splat), if the |
| // shallow score failed, or if both are consecutive loads (their score is |
| // already final, so there is no point in recursing into their operands). |
| auto *I1 = dyn_cast<Instruction>(V1); |
| auto *I2 = dyn_cast<Instruction>(V2); |
| if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || |
| ShallowScoreAtThisLevel == VLOperands::ScoreFail || |
| (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) |
| return ShallowScoreAtThisLevel; |
| assert(I1 && I2 && "Should have early exited."); |
| |
| // Keep track of in-tree values for determining the external-use cost. |
| InLookAheadValues[V1] = Lane1; |
| InLookAheadValues[V2] = Lane2; |
| |
| // Contains the I2 operand indexes that got matched with I1 operands. |
| SmallSet<unsigned, 4> Op2Used; |
| |
| // Recursion towards the operands of I1 and I2. We are trying all possible |
| // operand pairs, and keeping track of the best score. |
| for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); |
| OpIdx1 != NumOperands1; ++OpIdx1) { |
| // Try to pair op1I with the best operand of I2. |
| int MaxTmpScore = 0; |
| unsigned MaxOpIdx2 = 0; |
| bool FoundBest = false; |
| // If I2 is commutative try all combinations. |
| unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1; |
| unsigned ToIdx = isCommutative(I2) |
| ? I2->getNumOperands() |
| : std::min(I2->getNumOperands(), OpIdx1 + 1); |
| assert(FromIdx <= ToIdx && "Bad index"); |
| for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { |
| // Skip operands already paired with OpIdx1. |
| if (Op2Used.count(OpIdx2)) |
| continue; |
| // Recursively calculate the cost at each level |
| int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, |
| {I2->getOperand(OpIdx2), Lane2}, |
| CurrLevel + 1, MaxLevel); |
| // Look for the best score. |
| if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { |
| MaxTmpScore = TmpScore; |
| MaxOpIdx2 = OpIdx2; |
| FoundBest = true; |
| } |
| } |
| if (FoundBest) { |
| // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. |
| Op2Used.insert(MaxOpIdx2); |
| ShallowScoreAtThisLevel += MaxTmpScore; |
| } |
| } |
| return ShallowScoreAtThisLevel; |
| } |
| |
| /// \Returns the look-ahead score, which tells us how much the sub-trees |
| /// rooted at \p LHS and \p RHS match, the more they match the higher the |
| /// score. This helps break ties in an informed way when we cannot decide on |
| /// the order of the operands by just considering the immediate |
| /// predecessors. |
| int getLookAheadScore(const std::pair<Value *, int> &LHS, |
| const std::pair<Value *, int> &RHS) { |
| InLookAheadValues.clear(); |
| return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); |
| } |
| |
| // Search all operands in Ops[*][Lane] for the one that matches best |
| // Ops[OpIdx][LastLane] and return its operand index. |
| // If no good match can be found, return None. |
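| // For example, if Ops[OpIdx][LastLane] is a load, the candidate in this |
| // lane that loads from the consecutive address typically gets the highest |
| // look-ahead score and is therefore selected. |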
| Optional<unsigned> |
| getBestOperand(unsigned OpIdx, int Lane, int LastLane, |
| ArrayRef<ReorderingMode> ReorderingModes) { |
| unsigned NumOperands = getNumOperands(); |
| |
| // The operand of the previous lane at OpIdx. |
| Value *OpLastLane = getData(OpIdx, LastLane).V; |
| |
| // Our strategy mode for OpIdx. |
| ReorderingMode RMode = ReorderingModes[OpIdx]; |
| |
| // The linearized opcode of the operand at OpIdx, Lane. |
| bool OpIdxAPO = getData(OpIdx, Lane).APO; |
| |
| // The best operand index and its score. |
| // Sometimes we have more than one option (e.g., Opcode and Undefs), so we |
| // are using the score to differentiate between the two. |
| struct BestOpData { |
| Optional<unsigned> Idx = None; |
| unsigned Score = 0; |
| } BestOp; |
| |
| // Iterate through all unused operands and look for the best. |
| for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { |
| // Get the operand at Idx and Lane. |
| OperandData &OpData = getData(Idx, Lane); |
| Value *Op = OpData.V; |
| bool OpAPO = OpData.APO; |
| |
| // Skip already selected operands. |
| if (OpData.IsUsed) |
| continue; |
| |
| // Skip if we are trying to move the operand to a position with a |
| // different opcode in the linearized tree form. This would break the |
| // semantics. |
| if (OpAPO != OpIdxAPO) |
| continue; |
| |
| // Look for an operand that matches the current mode. |
| switch (RMode) { |
| case ReorderingMode::Load: |
| case ReorderingMode::Constant: |
| case ReorderingMode::Opcode: { |
| bool LeftToRight = Lane > LastLane; |
| Value *OpLeft = (LeftToRight) ? OpLastLane : Op; |
| Value *OpRight = (LeftToRight) ? Op : OpLastLane; |
| unsigned Score = |
| getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); |
| if (Score > BestOp.Score) { |
| BestOp.Idx = Idx; |
| BestOp.Score = Score; |
| } |
| break; |
| } |
| case ReorderingMode::Splat: |
| if (Op == OpLastLane) |
| BestOp.Idx = Idx; |
| break; |
| case ReorderingMode::Failed: |
| return None; |
| } |
| } |
| |
| if (BestOp.Idx) { |
| getData(BestOp.Idx.getValue(), Lane).IsUsed = true; |
| return BestOp.Idx; |
| } |
| // If we could not find a good match return None. |
| return None; |
| } |
| |
| /// Helper for reorder(). \Returns the lane that we should start |
| /// reordering from. This is the one which has the least number of operands |
| /// that can freely move about. |
| unsigned getBestLaneToStartReordering() const { |
| unsigned BestLane = 0; |
| unsigned Min = UINT_MAX; |
| for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; |
| ++Lane) { |
| unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); |
| if (NumFreeOps < Min) { |
| Min = NumFreeOps; |
| BestLane = Lane; |
| } |
| } |
| return BestLane; |
| } |
| |
| /// \Returns the maximum number of operands that are allowed to be reordered |
| /// for \p Lane. This is used as a heuristic for selecting the first lane to |
| /// start operand reordering. |
| unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { |
| unsigned CntTrue = 0; |
| unsigned NumOperands = getNumOperands(); |
| // Operands with the same APO can be reordered. We therefore need to count |
| // how many of them we have for each APO, like this: Cnt[APO] = x. |
| // Since we only have two APOs, namely true and false, we can avoid using |
| // a map. Instead we can simply count the number of operands that |
| // correspond to one of them (in this case the 'true' APO), and calculate |
| // the other by subtracting it from the total number of operands. |
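| // For example, a lane holding a 'sub' has operand APOs {false, true}, so |
| // the result is 1; a lane holding an 'add' has APOs {false, false}, so the |
| // result is 2, i.e. both of its operands are free to be reordered. |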
| for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) |
| if (getData(OpIdx, Lane).APO) |
| ++CntTrue; |
| unsigned CntFalse = NumOperands - CntTrue; |
| return std::max(CntTrue, CntFalse); |
| } |
| |
| /// Go through the instructions in VL and append their operands. |
| void appendOperandsOfVL(ArrayRef<Value *> VL) { |
| assert(!VL.empty() && "Bad VL"); |
| assert((empty() || VL.size() == getNumLanes()) && |
| "Expected same number of lanes"); |
| assert(isa<Instruction>(VL[0]) && "Expected instruction"); |
| unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands(); |
| OpsVec.resize(NumOperands); |
| unsigned NumLanes = VL.size(); |
| for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { |
| OpsVec[OpIdx].resize(NumLanes); |
| for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { |
| assert(isa<Instruction>(VL[Lane]) && "Expected instruction"); |
| // Our tree has just 3 nodes: the root and two operands. |
| // It is therefore trivial to get the APO. We only need to check the |
| // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or |
| // RHS operand. The LHS operand of both add and sub is never attached |
| // to an inverse operation in the linearized form, therefore its APO |
| // is false. The RHS is true only if VL[Lane] is an inverse operation. |
| |
| // Since operand reordering is performed on groups of commutative |
| // operations or alternating sequences (e.g., +, -), we can safely |
| // tell the inverse operations by checking commutativity. |
| bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane])); |
| bool APO = (OpIdx == 0) ? false : IsInverseOperation; |
| OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx), |
| APO, false}; |
| } |
| } |
| } |
| |
| /// \returns the number of operands. |
| unsigned getNumOperands() const { return OpsVec.size(); } |
| |
| /// \returns the number of lanes. |
| unsigned getNumLanes() const { return OpsVec[0].size(); } |
| |
| /// \returns the operand value at \p OpIdx and \p Lane. |
| Value *getValue(unsigned OpIdx, unsigned Lane) const { |
| return getData(OpIdx, Lane).V; |
| } |
| |
| /// \returns true if the data structure is empty. |
| bool empty() const { return OpsVec.empty(); } |
| |
| /// Clears the data. |
| void clear() { OpsVec.clear(); } |
| |
| /// \Returns true if there are enough operands identical to \p Op to fill |
| /// the whole vector. |
| /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow. |
| bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { |
| bool OpAPO = getData(OpIdx, Lane).APO; |
| for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { |
| if (Ln == Lane) |
| continue; |
| // This is set to true if we found a candidate for broadcast at Lane. |
| bool FoundCandidate = false; |
| for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) { |
| OperandData &Data = getData(OpI, Ln); |
| if (Data.APO != OpAPO || Data.IsUsed) |
| continue; |
| if (Data.V == Op) { |
| FoundCandidate = true; |
| Data.IsUsed = true; |
| break; |
| } |
| } |
| if (!FoundCandidate) |
| return false; |
| } |
| return true; |
| } |
| |
| public: |
| /// Initialize with all the operands of the instruction vector \p RootVL. |
| VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, |
| ScalarEvolution &SE, const BoUpSLP &R) |
| : DL(DL), SE(SE), R(R) { |
| // Append all the operands of RootVL. |
| appendOperandsOfVL(RootVL); |
| } |
| |
| /// \Returns a value vector with the operands across all lanes for the |
| /// operand at \p OpIdx. |
| ValueList getVL(unsigned OpIdx) const { |
| ValueList OpVL(OpsVec[OpIdx].size()); |
| assert(OpsVec[OpIdx].size() == getNumLanes() && |
| "Expected same num of lanes across all operands"); |
| for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane) |
| OpVL[Lane] = OpsVec[OpIdx][Lane].V; |
| return OpVL; |
| } |
| |
| // Performs operand reordering for 2 or more operands. |
| // The operands are reordered in place in OpsVec[OpIdx][Lane], trying to |
| // match the operand chosen for the same OpIdx in the neighboring lanes. |
| void reorder() { |
| unsigned NumOperands = getNumOperands(); |
| unsigned NumLanes = getNumLanes(); |
| // Each operand has its own mode. We are using this mode to help us select |
| // the instructions for each lane, so that they match best with the ones |
| // we have selected so far. |
| SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands); |
| |
| // This is a greedy single-pass algorithm. We are going over each lane |
| // once and deciding on the best order right away with no back-tracking. |
| // However, in order to increase its effectiveness, we start with the lane |
| // that has operands that can move the least. For example, given the |
| // following lanes: |
| // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd |
| // Lane 1 : A[1] = C[1] - B[1] // Visited 1st |
| // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd |
| // Lane 3 : A[3] = C[3] - B[3] // Visited 4th |
| // we will start at Lane 1, since the operands of the subtraction cannot |
| // be reordered. Then we will visit the rest of the lanes in a circular |
| // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. |
| |
| // Find the first lane that we will start our search from. |
| unsigned FirstLane = getBestLaneToStartReordering(); |
| |
| // Initialize the modes. |
| for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { |
| Value *OpLane0 = getValue(OpIdx, FirstLane); |
| // Keep track if we have instructions with all the same opcode on one |
| // side. |
| if (isa<LoadInst>(OpLane0)) |
| ReorderingModes[OpIdx] = ReorderingMode::Load; |
| else if (isa<Instruction>(OpLane0)) { |
| // Check if OpLane0 should be broadcast. |
| if (shouldBroadcast(OpLane0, OpIdx, FirstLane)) |
| ReorderingModes[OpIdx] = ReorderingMode::Splat; |
| else |
| ReorderingModes[OpIdx] = ReorderingMode::Opcode; |
| } |
| else if (isa<Constant>(OpLane0)) |
| ReorderingModes[OpIdx] = ReorderingMode::Constant; |
| else if (isa<Argument>(OpLane0)) |
| // Our best hope is a Splat. It may save some cost in some cases. |
| ReorderingModes[OpIdx] = ReorderingMode::Splat; |
| else |
| // NOTE: This should be unreachable. |
| ReorderingModes[OpIdx] = ReorderingMode::Failed; |
| } |
| |
| // If the initial strategy fails for any of the operand indexes, then we |
| // perform reordering again in a second pass. This helps avoid assigning |
| // high priority to the failed strategy, and should improve reordering for |
| // the non-failed operand indexes. |
| for (int Pass = 0; Pass != 2; ++Pass) { |
| // Skip the second pass if the first pass did not fail. |
| bool StrategyFailed = false; |
| // Mark all operand data as free to use. |
| clearUsed(); |
| // We keep the original operand order for the FirstLane, so reorder the |
| // rest of the lanes. We are visiting the nodes in a circular fashion, |
| // using FirstLane as the center point and increasing the radius |
| // distance. |
| for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { |
| // Visit the lane on the right and then the lane on the left. |
| for (int Direction : {+1, -1}) { |
| int Lane = FirstLane + Direction * Distance; |
| if (Lane < 0 || Lane >= (int)NumLanes) |
| continue; |
| int LastLane = Lane - Direction; |
| assert(LastLane >= 0 && LastLane < (int)NumLanes && |
| "Out of bounds"); |
| // Look for a good match for each operand. |
| for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { |
| // Search for the operand that matches the one chosen for OpIdx in LastLane. |
| Optional<unsigned> BestIdx = |
| getBestOperand(OpIdx, Lane, LastLane, ReorderingModes); |
| // By not selecting a value, we allow the operands that follow to |
| // select a better matching value. We will get a non-null value in |
| // the next run of getBestOperand(). |
| if (BestIdx) { |
| // Swap the current operand with the one returned by |
| // getBestOperand(). |
| swap(OpIdx, BestIdx.getValue(), Lane); |
| } else { |
| // We failed to find a best operand, set mode to 'Failed'. |
| ReorderingModes[OpIdx] = ReorderingMode::Failed; |
| // Enable the second pass. |
| StrategyFailed = true; |
| } |
| } |
| } |
| } |
| // Skip second pass if the strategy did not fail. |
| if (!StrategyFailed) |
| break; |
| } |
| } |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { |
| switch (RMode) { |
| case ReorderingMode::Load: |
| return "Load"; |
| case ReorderingMode::Opcode: |
| return "Opcode"; |
| case ReorderingMode::Constant: |
| return "Constant"; |
| case ReorderingMode::Splat: |
| return "Splat"; |
| case ReorderingMode::Failed: |
| return "Failed"; |
| } |
| llvm_unreachable("Unimplemented Reordering Type"); |
| } |
| |
| LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode, |
| raw_ostream &OS) { |
| return OS << getModeStr(RMode); |
| } |
| |
| /// Debug print. |
| LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) { |
| printMode(RMode, dbgs()); |
| } |
| |
| friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { |
| return printMode(RMode, OS); |
| } |
| |
| LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const { |
| const unsigned Indent = 2; |
| unsigned Cnt = 0; |
| for (const OperandDataVec &OpDataVec : OpsVec) { |
| OS << "Operand " << Cnt++ << "\n"; |
| for (const OperandData &OpData : OpDataVec) { |
| OS.indent(Indent) << "{"; |
| if (Value *V = OpData.V) |
| OS << *V; |
| else |
| OS << "null"; |
| OS << ", APO:" << OpData.APO << "}\n"; |
| } |
| OS << "\n"; |
| } |
| return OS; |
| } |
| |
| /// Debug print. |
| LLVM_DUMP_METHOD void dump() const { print(dbgs()); } |
| #endif |
| }; |
| |
| /// Checks if the instruction is marked for deletion. |
| bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } |
| |
| /// Marks the values' operands for later deletion by replacing them with |
| /// Undefs. |
| void eraseInstructions(ArrayRef<Value *> AV); |
| |
| ~BoUpSLP(); |
| |
| private: |
| /// Checks if all users of \p I are the part of the vectorization tree. |
| bool areAllUsersVectorized(Instruction *I) const; |
| |
| /// \returns the cost of the vectorizable entry. |
| int getEntryCost(TreeEntry *E); |
| |
| /// This is the recursive part of buildTree. |
| void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, |
| const EdgeInfo &EI); |
| |
| /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can |
| /// be vectorized to use the original vector (or aggregate "bitcast" to a |
| /// vector) and sets \p CurrentOrder to the identity permutation; otherwise |
| /// returns false, setting \p CurrentOrder to either an empty vector or a |
| /// non-identity permutation that allows reusing the extract instructions. |
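| /// For example, extracts of elements 0, 1, 2, 3 (in that order) from a |
| /// single vector yield the identity order and the original vector can be |
| /// reused directly, whereas extracts of 1, 0, 3, 2 can still reuse it via |
| /// the returned non-identity permutation. |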
| bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, |
| SmallVectorImpl<unsigned> &CurrentOrder) const; |
| |
| /// Vectorize a single entry in the tree. |
| Value *vectorizeTree(TreeEntry *E); |
| |
| /// Vectorize a single entry in the tree, starting in \p VL. |
| Value *vectorizeTree(ArrayRef<Value *> VL); |
| |
| /// \returns the scalarization cost for this type. Scalarization in this |
| /// context means the creation of vectors from a group of scalars. |
| int getGatherCost(FixedVectorType *Ty, |
| const DenseSet<unsigned> &ShuffledIndices) const; |
| |
| /// \returns the scalarization cost for this list of values. Assuming that |
| /// this subtree gets vectorized, we may need to extract the values from the |
| /// roots. This method calculates the cost of extracting the values. |
| int getGatherCost(ArrayRef<Value *> VL) const; |
| |
| /// Set the Builder insert point to one after the last instruction in |
| /// the bundle |
| void setInsertPointAfterBundle(TreeEntry *E); |
| |
| /// \returns a vector from a collection of scalars in \p VL. |
| Value *Gather(ArrayRef<Value *> VL, FixedVectorType *Ty); |
| |
| /// \returns whether the VectorizableTree is fully vectorizable and will |
| /// be beneficial even if the tree height is tiny. |
| bool isFullyVectorizableTinyTree() const; |
| |
| /// Reorder commutative or alt operands to get a better probability of |
| /// generating vectorized code. |
| static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, |
| SmallVectorImpl<Value *> &Left, |
| SmallVectorImpl<Value *> &Right, |
| const DataLayout &DL, |
| ScalarEvolution &SE, |
| const BoUpSLP &R); |
| struct TreeEntry { |
| using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; |
| TreeEntry(VecTreeTy &Container) : Container(Container) {} |
| |
| /// \returns true if the scalars in VL are equal to this entry. |
| bool isSame(ArrayRef<Value *> VL) const { |
| if (VL.size() == Scalars.size()) |
| return std::equal(VL.begin(), VL.end(), Scalars.begin()); |
| return VL.size() == ReuseShuffleIndices.size() && |
| std::equal( |
| VL.begin(), VL.end(), ReuseShuffleIndices.begin(), |
| [this](Value *V, int Idx) { return V == Scalars[Idx]; }); |
| } |
| |
| /// A vector of scalars. |
| ValueList Scalars; |
| |
| /// The Scalars are vectorized into this value. It is initialized to Null. |
| Value *VectorizedValue = nullptr; |
| |
| /// Do we need to gather this sequence? |
| enum EntryState { Vectorize, NeedToGather }; |
| EntryState State; |
| |
| /// Does this sequence require some shuffling? |
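| /// For example, if the original bundle is {%a, %b, %a, %b}, Scalars holds |
| /// the two unique values {%a, %b} and ReuseShuffleIndices is {0, 1, 0, 1}. |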
| SmallVector<int, 4> ReuseShuffleIndices; |
| |
| /// Does this entry require reordering? |
| ArrayRef<unsigned> ReorderIndices; |
| |
| /// Points back to the VectorizableTree. |
| /// |
| /// Only used for Graphviz right now. Unfortunately GraphTraits::NodeRef has |
| /// to be a pointer and needs to be able to initialize the child iterator. |
| /// Thus we need a reference back to the container to translate the indices |
| /// to entries. |
| VecTreeTy &Container; |
| |
| /// The user tree entries of this entry, together with the operand index |
| /// through which they use it. We can actually have multiple users, so the |
| /// data structure is not truly a tree. |
| SmallVector<EdgeInfo, 1> UserTreeIndices; |
| |
| /// The index of this treeEntry in VectorizableTree. |
| int Idx = -1; |
| |
| private: |
| /// The operands of each instruction in each lane Operands[op_index][lane]. |
| /// Note: This helps avoid the replication of the code that performs the |
| /// reordering of operands during buildTree_rec() and vectorizeTree(). |
| SmallVector<ValueList, 2> Operands; |
| |
| /// The main/alternate instruction. |
| Instruction *MainOp = nullptr; |
| Instruction *AltOp = nullptr; |
| |
| public: |
| /// Set this bundle's \p OpIdx'th operand to \p OpVL. |
| void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { |
| if (Operands.size() < OpIdx + 1) |
| Operands.resize(OpIdx + 1); |
| assert(Operands[OpIdx].size() == 0 && "Already resized?"); |
| Operands[OpIdx].resize(Scalars.size()); |
| for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) |
| Operands[OpIdx][Lane] = OpVL[Lane]; |
| } |
| |
| /// Set the operands of this bundle in their original order. |
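| /// For example, for a bundle of two adds {add %a, %b, add %c, %d} this |
| /// fills Operands[0] = {%a, %c} and Operands[1] = {%b, %d}. |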
| void setOperandsInOrder() { |
| assert(Operands.empty() && "Already initialized?"); |
| auto *I0 = cast<Instruction>(Scalars[0]); |
| Operands.resize(I0->getNumOperands()); |
| unsigned NumLanes = Scalars.size(); |
| for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); |
| OpIdx != NumOperands; ++OpIdx) { |
| Operands[OpIdx].resize(NumLanes); |
| for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { |
| auto *I = cast<Instruction>(Scalars[Lane]); |
| assert(I->getNumOperands() == NumOperands && |
| "Expected same number of operands"); |
| Operands[OpIdx][Lane] = I->getOperand(OpIdx); |
| } |
| } |
| } |
| |
| /// \returns the \p OpIdx operand of this TreeEntry. |
| ValueList &getOperand(unsigned OpIdx) { |
| assert(OpIdx < Operands.size() && "Off bounds"); |
| return Operands[OpIdx]; |
| } |
| |
| /// \returns the number of operands. |
| unsigned getNumOperands() const { return Operands.size(); } |
| |
| /// \returns the single \p OpIdx operand. |
| Value *getSingleOperand(unsigned OpIdx) const { |
| assert(OpIdx < Operands.size() && "Off bounds"); |
| assert(!Operands[OpIdx].empty() && "No operand available"); |
| return Operands[OpIdx][0]; |
| } |
| |
| /// \returns true if some instructions in the list have alternate opcodes. |
| bool isAltShuffle() const { |
| return getOpcode() != getAltOpcode(); |
| } |
| |
| bool isOpcodeOrAlt(Instruction *I) const { |
| unsigned CheckedOpcode = I->getOpcode(); |
| return (getOpcode() == CheckedOpcode || |
| getAltOpcode() == CheckedOpcode); |
| } |
| |
| /// Chooses the correct key for scheduling data. If \p Op has the same (or |
| /// alternate) opcode as the main instruction, the key is \p Op. Otherwise |
| /// the key is the main instruction. |
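| /// For example, in an add/sub alternate-opcode bundle both add and sub |
| /// instructions are returned unchanged, while any other value maps to |
| /// MainOp. |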
| Value *isOneOf(Value *Op) const { |
| auto *I = dyn_cast<Instruction>(Op); |
| if (I && isOpcodeOrAlt(I)) |
| return Op; |
| return MainOp; |
| } |
| |
| void setOperations(const InstructionsState &S) { |
| MainOp = S.MainOp; |
| AltOp = S.AltOp; |
| } |
| |
| Instruction *getMainOp() const { |
| return MainOp; |
| } |
| |
| Instruction *getAltOp() const { |
| return AltOp; |
| } |
| |
| /// The main/alternate opcodes for the list of instructions. |
| unsigned getOpcode() const { |
| return MainOp ? MainOp->getOpcode() : 0; |
| } |
| |
| unsigned getAltOpcode() const { |
| return AltOp ? AltOp->getOpcode() : 0; |
| } |
| |
| /// Update the operations state of this entry if a reorder occurred. |
| bool updateStateIfReorder() { |
| if (ReorderIndices.empty()) |
| return false; |
| InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); |
| setOperations(S); |
| return true; |
| } |
| |
| #ifndef NDEBUG |
| /// Debug printer. |
| LLVM_DUMP_METHOD void dump() const { |
| dbgs() << Idx << ".\n"; |
| for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) { |
| dbgs() << "Operand " << OpI << ":\n"; |
| for (const Value *V : Operands[OpI]) |
| dbgs().indent(2) << *V << "\n"; |
| } |
| dbgs() << "Scalars: \n"; |
| for (Value *V : Scalars) |
| dbgs().indent(2) << *V << "\n"; |
| dbgs() << "State: "; |
| switch (State) { |
| case Vectorize: |
| dbgs() << "Vectorize\n"; |
| break; |
| case NeedToGather: |
| dbgs() << "NeedToGather\n"; |
| break; |
| } |
| dbgs() << "MainOp: "; |
| if (MainOp) |
| dbgs() << *MainOp << "\n"; |
| else |
| dbgs() << "NULL\n"; |
| dbgs() << "AltOp: "; |
| if (AltOp) |
| dbgs() << *AltOp << "\n"; |
| else |
| dbgs() << "NULL\n"; |
| dbgs() << "VectorizedValue: "; |
| if (VectorizedValue) |
| dbgs() << *VectorizedValue << "\n"; |
| else |
| dbgs() << "NULL\n"; |
| dbgs() << "ReuseShuffleIndices: "; |
| if (ReuseShuffleIndices.empty()) |
| dbgs() << "Emtpy"; |
| else |
| for (unsigned ReuseIdx : ReuseShuffleIndices) |
| dbgs() << ReuseIdx << ", "; |
| dbgs() << "\n"; |
| dbgs() << "ReorderIndices: "; |
| for (unsigned ReorderIdx : ReorderIndices) |
| dbgs() << ReorderIdx << ", "; |
| dbgs() << "\n"; |
| dbgs() << "UserTreeIndices: "; |
| for (const auto &EInfo : UserTreeIndices) |
| dbgs() << EInfo << ", "; |
| dbgs() << "\n"; |
| } |
| #endif |
| }; |
| |
| /// Create a new VectorizableTree entry. |
| TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, |
| const InstructionsState &S, |
| const EdgeInfo &UserTreeIdx, |
| ArrayRef<unsigned> ReuseShuffleIndices = None, |
| ArrayRef<unsigned> ReorderIndices = None) { |
| bool Vectorized = (bool)Bundle; |
| VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); |
| TreeEntry *Last = VectorizableTree.back().get(); |
| Last->Idx = VectorizableTree.size() - 1; |
| Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); |
| Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather; |
| Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), |
| ReuseShuffleIndices.end()); |
| Last->ReorderIndices = ReorderIndices; |
| Last->setOperations(S); |
| if (Vectorized) { |
| for (int i = 0, e = VL.size(); i != e; ++i) { |
| assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); |
| ScalarToTreeEntry[VL[i]] = Last; |
| } |
| // Update the scheduler bundle to point to this TreeEntry. |
| unsigned Lane = 0; |
| for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; |
| BundleMember = BundleMember->NextInBundle) { |
| BundleMember->TE = Last; |
| BundleMember->Lane = Lane; |
| ++Lane; |
| } |
| assert((!Bundle.getValue() || Lane == VL.size()) && |
| "Bundle and VL out of sync"); |
| } else { |
| MustGather.insert(VL.begin(), VL.end()); |
| } |
| |
| if (UserTreeIdx.UserTE) |
| Last->UserTreeIndices.push_back(UserTreeIdx); |
| |
| return Last; |
| } |
| |
| /// -- Vectorization State -- |
| /// Holds all of the tree entries. |
| TreeEntry::VecTreeTy VectorizableTree; |
| |
| #ifndef NDEBUG |
| /// Debug printer. |
| LLVM_DUMP_METHOD void dumpVectorizableTree() const { |
| for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { |
| VectorizableTree[Id]->dump(); |
| dbgs() << "\n"; |
| } |
| } |
| #endif |
| |
| TreeEntry *getTreeEntry(Value *V) { |
| auto I = ScalarToTreeEntry.find(V); |
| if (I != ScalarToTreeEntry.end()) |
| return I->second; |
| return nullptr; |
| } |
| |
| const TreeEntry *getTreeEntry(Value *V) const { |
| auto I = ScalarToTreeEntry.find(V); |
| if (I != ScalarToTreeEntry.end()) |
| return I->second; |
| return nullptr; |
| } |
| |
| /// Maps a specific scalar to its tree entry. |
| SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; |
| |
| /// Maps a value to the proposed vectorizable size. |
| SmallDenseMap<Value *, unsigned> InstrElementSize; |
| |
| /// A list of scalars that we found we need to keep as scalars. |
| ValueSet MustGather; |
| |
| /// This POD struct describes one external user in the vectorized tree. |
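| /// For example, if the scalar in lane 2 of a vectorized bundle also feeds |
| /// an instruction outside the tree, the entry (Scalar, OutsideUser, 2) |
| /// records that an extract from the vector is needed for that user. |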
| struct ExternalUser { |
| ExternalUser(Value *S, llvm::User *U, int L) |
| : Scalar(S), User(U), Lane(L) {} |
| |
| // Which scalar in our function. |
| Value *Scalar; |
| |
| // The user that uses the scalar. |
| llvm::User *User; |
| |
| // Which lane does the scalar belong to. |
| int Lane; |
| }; |
| using UserList = SmallVector<ExternalUser, 16>; |
| |
| /// Checks if two instructions may access the same memory. |
| /// |
| /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it |
| /// is invariant in the calling loop. |
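| /// The result is cached in AliasCache, keyed by the (Inst1, Inst2) pair, so |
| /// repeated queries for the same pair do not re-run the alias analysis. |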
| bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, |
| Instruction *Inst2) { |
| // First check if the result is already in the cache. |
| AliasCacheKey key = std::make_pair(Inst1, Inst2); |
| Optional<bool> &result = AliasCache[key]; |
| if (result.hasValue()) { |
| return result.getValue(); |
| } |
| MemoryLocation Loc2 = getLocation(Inst2, AA); |
| bool aliased = true; |
| if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { |
| // Do the alias check. |
| aliased = AA->alias(Loc1, Loc2); |
| } |
| // Store the result in the cache. |
| result = aliased; |
| return aliased; |
| } |
| |
| using AliasCacheKey = std::pair<Instruction *, Instruction *>; |
| |
| /// Cache for alias results. |
| /// TODO: consider moving this to the AliasAnalysis itself. |
| DenseMap<AliasCacheKey, Optional<bool>> AliasCache; |
| |
| /// Removes an instruction from its block and eventually deletes it. |
| /// It's like Instruction::eraseFromParent() except that the actual deletion |
| /// is delayed until BoUpSLP is destructed. |
| /// This is required to ensure that there are no incorrect collisions in the |
| /// AliasCache, which can happen if a new instruction is allocated at the |
| /// same address as a previously deleted instruction. |
| void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { |
| auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; |
| It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; |
| } |
| |
| /// Temporary store for deleted instructions. Instructions will be deleted |
| /// eventually when the BoUpSLP is destructed. |
| DenseMap<Instruction *, bool> DeletedInstructions; |
| |
| /// A list of values that need to be extracted out of the tree. |
| /// This list holds pairs of (Internal Scalar : External User). External User |
| /// can be nullptr, it means that this Internal Scalar will be used later, |
| /// after vectorization. |
| UserList ExternalUses; |
| |
| /// Values used only by @llvm.assume calls. |
| SmallPtrSet<const Value *, 32> EphValues; |
| |
| /// Holds all of the instructions that we gathered. |
| SetVector<Instruction *> GatherSeq; |
| |
| /// A list of blocks that we are going to CSE. |
| SetVector<BasicBlock *> CSEBlocks; |
| |
| /// Contains all scheduling relevant data for an instruction. |
| /// A ScheduleData either represents a single instruction or a member of an |
| /// instruction bundle (= a group of instructions which is combined into a |
| /// vector instruction). |
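| /// For example, four scalar loads that are combined into one vector load |
| /// are represented by four ScheduleData objects chained via NextInBundle, |
| /// all sharing the same FirstInBundle head. |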
| struct ScheduleData { |
| // The initial value for the dependency counters. It means that the |
| // dependencies are not calculated yet. |
| enum { InvalidDeps = -1 }; |
| |
| ScheduleData() = default; |
| |
| void init(int BlockSchedulingRegionID, Value *OpVal) { |
| FirstInBundle = this; |
| NextInBundle = nullptr; |
| NextLoadStore = nullptr; |
| IsScheduled = false; |
| SchedulingRegionID = BlockSchedulingRegionID; |
| UnscheduledDepsInBundle = UnscheduledDeps; |
| clearDependencies(); |
| OpValue = OpVal; |
| TE = nullptr; |
| Lane = -1; |
| } |
| |
| /// Returns true if the dependency information has been calculated. |
| bool hasValidDependencies() const { return Dependencies != InvalidDeps; } |
| |
| /// Returns true for single instructions and for bundle representatives |
| /// (= the head of a bundle). |
| bool isSchedulingEntity() const { return FirstInBundle == this; } |
| |
| /// Returns true if it represents an instruction bundle and not only a |
| /// single instruction. |
| bool isPartOfBundle() const { |
| return NextInBundle != nullptr || FirstInBundle != this; |
| } |
| |
| /// Returns true if it is ready for scheduling, i.e. it has no more |
| /// unscheduled dependent instructions/bundles. |
| bool isReady() const { |
| assert(isSchedulingEntity() && |
| "can't consider non-scheduling entity for ready list"); |
| return UnscheduledDepsInBundle == 0 && !IsScheduled; |
| } |
| |
| /// Modifies the number of unscheduled dependencies, also updating it for |
| /// the whole bundle. |
| int incrementUnscheduledDeps(int Incr) { |
| UnscheduledDeps += Incr; |
| return FirstInBundle->UnscheduledDepsInBundle += Incr; |
| } |
| |
| /// Sets the number of unscheduled dependencies to the number of |
| /// dependencies. |
| void resetUnscheduledDeps() { |
| incrementUnscheduledDeps(Dependencies - UnscheduledDeps); |
| } |
| |
| /// Clears all dependency information. |
| void clearDependencies() { |
| Dependencies = InvalidDeps; |
| resetUnscheduledDeps(); |
| MemoryDependencies.clear(); |
| } |
| |
| void dump(raw_ostream &os) const { |
| if (!isSchedulingEntity()) { |
| os << "/ " << *Inst; |
| } else if (NextInBundle) { |
| os << '[' << *Inst; |
| ScheduleData *SD = NextInBundle; |
| while (SD) { |
| os << ';' << *SD->Inst; |
| SD = SD->NextInBundle; |
| } |
| os << ']'; |
| } else { |
| os << *Inst; |
| } |
| } |
| |
| Instruction *Inst = nullptr; |
| |
| /// Points to the head in an instruction bundle (and always to this for |
| /// single instructions). |
| ScheduleData *FirstInBundle = nullptr; |
| |
| /// Singly linked list of all instructions in a bundle. Null if it is a |
| /// single instruction. |
| ScheduleData *NextInBundle = nullptr; |
| |
| /// Singly linked list of all memory instructions (e.g. load, store, call) |
| /// in the block, up to the end of the scheduling region. |
| ScheduleData *NextLoadStore = nullptr; |
| |
| /// The dependent memory instructions. |
| /// This list is derived on demand in calculateDependencies(). |
| SmallVector<ScheduleData *, 4> MemoryDependencies; |
| |
| /// This ScheduleData is in the current scheduling region if this matches |
| /// the current SchedulingRegionID of BlockScheduling. |
| int SchedulingRegionID = 0; |
| |
| /// Used for getting a "good" final ordering of instructions. |
| int SchedulingPriority = 0; |
| |
| /// The number of dependencies. Consists of the number of users of the |
| /// instruction plus the number of dependent memory instructions (if any). |
| /// This value is calculated on demand. |
| /// If InvalidDeps, the number of dependencies is not calculated yet. |
| int Dependencies = InvalidDeps; |
| |
| /// The number of dependencies minus the number of dependencies of scheduled |
| /// instructions. As soon as this is zero, the instruction/bundle gets ready |
| /// for scheduling. |
| /// Note that this is negative as long as Dependencies is not calculated. |
| int UnscheduledDeps = InvalidDeps; |
| |
| /// The sum of UnscheduledDeps in a bundle. Equals UnscheduledDeps for |
| /// single instructions. |
| int UnscheduledDepsInBundle = InvalidDeps; |
| |
| /// True if this instruction is scheduled (or considered as scheduled in the |
| /// dry-run). |
| bool IsScheduled = false; |
| |
| /// Opcode of the current instruction in the schedule data. |
| Value *OpValue = nullptr; |
| |
| /// The TreeEntry that this instruction corresponds to. |
| TreeEntry *TE = nullptr; |
| |
| /// The lane of this node in the TreeEntry. |
| int Lane = -1; |
| }; |
| |
| #ifndef NDEBUG |
| friend inline raw_ostream &operator<<(raw_ostream &os, |
| const BoUpSLP::ScheduleData &SD) { |
| SD.dump(os); |
| return os; |
| } |
| #endif |
| |
| friend struct GraphTraits<BoUpSLP *>; |
| friend struct DOTGraphTraits<BoUpSLP *>; |
| |
| /// Contains all scheduling data for a basic block. |
| struct BlockScheduling { |
| BlockScheduling(BasicBlock *BB) |
| : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} |
| |
| void clear() { |
| ReadyInsts.clear(); |
| ScheduleStart = nullptr; |
| ScheduleEnd = nullptr; |
| FirstLoadStoreInRegion = nullptr; |
| LastLoadStoreInRegion = nullptr; |
| |
| // Reduce the maximum schedule region size by the size of the |
| // previous scheduling run. |
| ScheduleRegionSizeLimit -= ScheduleRegionSize; |
| if (ScheduleRegionSizeLimit < MinScheduleRegionSize) |
| ScheduleRegionSizeLimit = MinScheduleRegionSize; |
| ScheduleRegionSize = 0; |
| |
| // Make a new scheduling region, i.e. all existing ScheduleData is not |
| // in the new region yet. |
| ++SchedulingRegionID; |
| } |
| |
| ScheduleData *getScheduleData(Value *V) { |
| ScheduleData *SD = ScheduleDataMap[V]; |
| if (SD && SD->SchedulingRegionID == SchedulingRegionID) |
| return SD; |
| return nullptr; |
| } |
| |
| ScheduleData *getScheduleData(Value *V, Value *Key) { |
| if (V == Key) |
| return getScheduleData(V); |
| auto I = ExtraScheduleDataMap.find(V); |
| if (I != ExtraScheduleDataMap.end()) { |
| ScheduleData *SD = I->second[Key]; |
| if (SD && SD->SchedulingRegionID == SchedulingRegionID) |
| return SD; |
| } |
| return nullptr; |
| } |
| |
| bool isInSchedulingRegion(ScheduleData *SD) const { |
| return SD->SchedulingRegionID == SchedulingRegionID; |
| } |
| |
| /// Marks an instruction as scheduled and puts all dependent ready |
| /// instructions into the ready-list. |
| template <typename ReadyListType> |
| void schedule(ScheduleData *SD, ReadyListType &ReadyList) { |
| SD->IsScheduled = true; |
| LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); |
| |
| ScheduleData *BundleMember = SD; |
| while (BundleMember) { |
| if (BundleMember->Inst != BundleMember->OpValue) { |
| BundleMember = BundleMember->NextInBundle; |
| continue; |
| } |
| // Handle the def-use chain dependencies. |
| |
| // Decrement the unscheduled counter and insert into the ready list if ready. |
| auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { |
| doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { |
| if (OpDef && OpDef->hasValidDependencies() && |
| OpDef->incrementUnscheduledDeps(-1) == 0) { |
| // There are no more unscheduled dependencies after |
| // decrementing, so we can put the dependent instruction |
| // into the ready list. |
| ScheduleData *DepBundle = OpDef->FirstInBundle; |
| assert(!DepBundle->IsScheduled && |
| "already scheduled bundle gets ready"); |
| ReadyList.insert(DepBundle); |
| LLVM_DEBUG(dbgs() |
| << "SLP: gets ready (def): " << *DepBundle << "\n"); |
| } |
| }); |
| }; |
| |
| // If BundleMember is a vector bundle, its operands may have been |
| // reordered during buildTree(). We therefore need to get its operands |
| // through the TreeEntry. |
| if (TreeEntry *TE = BundleMember->TE) { |
| int Lane = BundleMember->Lane; |
| assert(Lane >= 0 && "Lane not set"); |
| |
| // Since the vectorization tree is being built recursively, this |
| // assertion ensures that the tree entry has all operands set before |
| // reaching this code. A couple of exceptions known at the moment are |
| // extracts, where the second (immediate) operand is not added. Since |
| // immediates do not affect scheduler behavior, this is considered |
| // okay. |
| auto *In = TE->getMainOp(); |
| assert(In && |
| (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) || |
| In->getNumOperands() == TE->getNumOperands()) && |
| "Missed TreeEntry operands?"); |
| (void)In; // fake use to avoid build failure when assertions disabled |
| |
| for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); |
| OpIdx != NumOperands; ++OpIdx) |
| if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) |
| DecrUnsched(I); |
| } else { |
| // If BundleMember is a stand-alone instruction, no operand reordering |
| // has taken place, so we directly access its operands. |
| for (Use &U : BundleMember->Inst->operands()) |
| if (auto *I = dyn_cast<Instruction>(U.get())) |
| DecrUnsched(I); |
| } |
| // Handle the memory dependencies. |
| for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { |
| if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { |
| // There are no more unscheduled dependencies after decrementing, |
| // so we can put the dependent instruction into the ready list. |
| ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; |
| assert(!DepBundle->IsScheduled && |
| "already scheduled bundle gets ready"); |
| ReadyList.insert(DepBundle); |
| LLVM_DEBUG(dbgs() |
| << "SLP: gets ready (mem): " << *DepBundle << "\n"); |
| } |
| } |
| BundleMember = BundleMember->NextInBundle; |
| } |
| } |
| |
| void doForAllOpcodes(Value *V, |
| function_ref<void(ScheduleData *SD)> Action) { |
| if (ScheduleData *SD = getScheduleData(V)) |
| Action(SD); |
| auto I = ExtraScheduleDataMap.find(V); |
| if (I != ExtraScheduleDataMap.end()) |
| for (auto &P : I->second) |
| if (P.second->SchedulingRegionID == SchedulingRegionID) |
| Action(P.second); |
| } |
| |
| /// Put all instructions into the ReadyList which are ready for scheduling. |
| template <typename ReadyListType> |
| void initialFillReadyList(ReadyListType &ReadyList) { |
| for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { |
| doForAllOpcodes(I, [&](ScheduleData *SD) { |
| if (SD->isSchedulingEntity() && SD->isReady()) { |
| ReadyList.insert(SD); |
| LLVM_DEBUG(dbgs() |
| << "SLP: initially in ready list: " << *I << "\n"); |
| } |
| }); |
| } |
| } |
| |
| /// Checks if a bundle of instructions can be scheduled, i.e. has no |
| /// cyclic dependencies. This is only a dry run; no instructions are |
| /// actually moved at this stage. |
| /// \returns the scheduling bundle. The returned Optional value is non-None |
| /// if \p VL is allowed to be scheduled. |
| Optional<ScheduleData *> |
| tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, |
| const InstructionsState &S); |
| |
| /// Un-bundles a group of instructions. |
| void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); |
| |
| /// Allocates schedule data chunk. |
| ScheduleData *allocateScheduleDataChunks(); |
| |
| /// Extends the scheduling region so that V is inside the region. |
| /// \returns true if the region size is within the limit. |
| bool extendSchedulingRegion(Value *V, const InstructionsState &S); |
| |
| /// Initialize the ScheduleData structures for new instructions in the |
| /// scheduling region. |
| void initScheduleData(Instruction *FromI, Instruction *ToI, |
| ScheduleData *PrevLoadStore, |
| ScheduleData *NextLoadStore); |
| |
| /// Updates the dependency information of a bundle and of all instructions/ |
| /// bundles which depend on the original bundle. |
| void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, |
| BoUpSLP *SLP); |
| |
| /// Sets all instructions in the scheduling region to un-scheduled. |
| void resetSchedule(); |
| |
| BasicBlock *BB; |
| |
| /// Simple memory allocation for ScheduleData. |
| std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; |
| |
| /// The size of a ScheduleData array in ScheduleDataChunks. |
| int ChunkSize; |
| |
| /// The allocator position in the current chunk, which is the last entry |
| /// of ScheduleDataChunks. |
| int ChunkPos; |
| |
| /// Attaches ScheduleData to Instruction. |
| /// Note that the mapping survives across all vectorization iterations, i.e. |
| /// ScheduleData structures are recycled. |
| DenseMap<Value *, ScheduleData *> ScheduleDataMap; |
| |
| /// Attaches ScheduleData to Instruction with the leading key. |
| DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>> |
| ExtraScheduleDataMap; |
| |
| struct ReadyList : SmallVector<ScheduleData *, 8> { |
| void insert(ScheduleData *SD) { push_back(SD); } |
| }; |
| |
| /// The ready-list for scheduling (only used for the dry-run). |
| ReadyList ReadyInsts; |
| |
| /// The first instruction of the scheduling region. |
| Instruction *ScheduleStart = nullptr; |
| |
| /// The first instruction _after_ the scheduling region. |
| Instruction *ScheduleEnd = nullptr; |
| |
| /// The first memory accessing instruction in the scheduling region |
| /// (can be null). |
| ScheduleData *FirstLoadStoreInRegion = nullptr; |
| |
| /// The last memory accessing instruction in the scheduling region |
| /// (can be null). |
| ScheduleData *LastLoadStoreInRegion = nullptr; |
| |
| /// The current size of the scheduling region. |
| int ScheduleRegionSize = 0; |
| |
| /// The maximum size allowed for the scheduling region. |
| int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; |
| |
| /// The ID of the scheduling region. For a new vectorization iteration this |
| /// is incremented, which "removes" all ScheduleData from the region. |
| // Make sure that the initial SchedulingRegionID is greater than the |
| // initial SchedulingRegionID in ScheduleData (which is 0). |
| int SchedulingRegionID = 1; |
| }; |
| |
| /// Attaches the BlockScheduling structures to basic blocks. |
| MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; |
| |
| /// Performs the "real" scheduling. Done before vectorization is actually |
| /// performed in a basic block. |
| void scheduleBlock(BlockScheduling *BS); |
| |
| /// List of users to ignore during scheduling and that don't need extracting. |
| ArrayRef<Value *> UserIgnoreList; |
| |
| using OrdersType = SmallVector<unsigned, 4>; |
| /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of |
| /// sorted SmallVectors of unsigned. |
| struct OrdersTypeDenseMapInfo { |
| static OrdersType getEmptyKey() { |
| OrdersType V; |
| V.push_back(~1U); |
| return V; |
| } |
| |
| static OrdersType getTombstoneKey() { |
| OrdersType V; |
| V.push_back(~2U); |
| return V; |
| } |
| |
| static unsigned getHashValue(const OrdersType &V) { |
| return static_cast<unsigned>(hash_combine_range(V.begin(), V.end())); |
| } |
| |
| static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { |
| return LHS == RHS; |
| } |
| }; |
| |
| /// Contains orders of operations along with the number of bundles that have |
| /// operations in this order. It stores only those orders that require |
| /// reordering; if reordering is not required, the bundle is counted using |
| /// \a NumOpsWantToKeepOriginalOrder. |
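| /// For example, if three bundles each require the reordering {1, 0}, this |
| /// map holds the entry {1, 0} -> 3. |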
| DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder; |
| /// Number of bundles that do not require reordering. |
| unsigned NumOpsWantToKeepOriginalOrder = 0; |
| |
| // Analysis and block reference. |
| Function *F; |
| ScalarEvolution *SE; |
| TargetTransformInfo *TTI; |
| TargetLibraryInfo *TLI; |
| AAResults *AA; |
| LoopInfo *LI; |
| DominatorTree *DT; |
| AssumptionCache *AC; |
| DemandedBits *DB; |
| const DataLayout *DL; |
| OptimizationRemarkEmitter *ORE; |
| |
| unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. |
| unsigned MinVecRegSize; // Set by cl::opt (default: 128). |
| |
| /// Instruction builder to construct the vectorized tree. |
| IRBuilder<> Builder; |
| |
| /// A map of scalar integer values to the smallest bit width with which they |
| /// can legally be represented. The values map to (width, signed) pairs, |
| /// where "width" indicates the minimum bit width and "signed" is True if the |
| /// value must be sign-extended, rather than zero-extended, back to its |
| /// original width. |
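| /// For example, an i32 value that only needs 8 bits and must be |
| /// sign-extended back to i32 maps to the pair (8, true). |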
| MapVector<Value *, std::pair<uint64_t, bool>> MinBWs; |
| }; |
| |
| } // end namespace slpvectorizer |
| |
| template <> struct GraphTraits<BoUpSLP *> { |
| using TreeEntry = BoUpSLP::TreeEntry; |
| |
| /// NodeRef has to be a pointer per the GraphWriter. |
| using NodeRef = TreeEntry *; |
| |
| using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; |
| |
| /// Add the VectorizableTree to the index iterator to be able to return |
| /// TreeEntry pointers. |
| struct ChildIteratorType |
| : public iterator_adaptor_base< |
| ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> { |
| ContainerTy &VectorizableTree; |
| |
| ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W, |
| ContainerTy &VT) |
| : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} |
| |
| NodeRef operator*() { return I->UserTE; } |
| }; |
| |
| static NodeRef getEntryNode(BoUpSLP &R) { |
| return R.VectorizableTree[0].get(); |
| } |
| |
| static ChildIteratorType child_begin(NodeRef N) { |
| return {N->UserTreeIndices.begin(), N->Container}; |
| } |
| |
| static ChildIteratorType child_end(NodeRef N) { |
| return {N->UserTreeIndices.end(), N->Container}; |
| } |
| |
| /// For the node iterator we just need to turn the TreeEntry iterator into a |
| /// TreeEntry* iterator so that it dereferences to NodeRef. |
| class nodes_iterator { |
| using ItTy = ContainerTy::iterator; |
| ItTy It; |
| |
| public: |
| nodes_iterator(const ItTy &It2) : It(It2) {} |
| NodeRef operator*() { return It->get(); } |
| nodes_iterator operator++() { |
| ++It; |
| return *this; |
| } |
| bool operator!=(const nodes_iterator &N2) const { return N2.It != It; } |
| }; |
| |
| static nodes_iterator nodes_begin(BoUpSLP *R) { |
| return nodes_iterator(R->VectorizableTree.begin()); |
| } |
| |
| static nodes_iterator nodes_end(BoUpSLP *R) { |
| return nodes_iterator(R->VectorizableTree.end()); |
| } |
| |
| static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } |
| }; |
| |
| template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { |
| using TreeEntry = BoUpSLP::TreeEntry; |
| |
| DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} |
| |
| std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { |
| std::string Str; |
| raw_string_ostream OS(Str); |
| if (isSplat(Entry->Scalars)) { |
| OS << "<splat> " << *Entry->Scalars[0]; |
| return Str; |
| } |
| for (auto V : Entry->Scalars) { |
| OS << *V; |
| if (std::any_of( |
| R->ExternalUses.begin(), R->ExternalUses.end(), |
| [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) |
| OS << " <extract>"; |
| OS << "\n"; |
| } |
| return Str; |
| } |
| |
| static std::string getNodeAttributes(const TreeEntry *Entry, |
| const BoUpSLP *) { |
| if (Entry->State == TreeEntry::NeedToGather) |
| return "color=red"; |
| return ""; |
| } |
| }; |
| |
| } // end namespace llvm |
| |
| BoUpSLP::~BoUpSLP() { |
| for (const auto &Pair : DeletedInstructions) { |
| // Replace operands of ignored instructions with Undefs if they were |
| // marked for deletion. |
| if (Pair.getSecond()) { |
| Value *Undef = UndefValue::get(Pair.getFirst()->getType()); |
| Pair.getFirst()->replaceAllUsesWith(Undef); |
| } |
| Pair.getFirst()->dropAllReferences(); |
| } |
| for (const auto &Pair : DeletedInstructions) { |
| assert(Pair.getFirst()->use_empty() && |
| "trying to erase instruction with users."); |
| Pair.getFirst()->eraseFromParent(); |
| } |
| assert(!verifyFunction(*F, &dbgs())); |
| } |
| |
| void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { |
| for (auto *V : AV) { |
| if (auto *I = dyn_cast<Instruction>(V)) |
| eraseInstruction(I, /*ReplaceOpsWithUndef=*/true); |
| } |
| } |
| |
| void BoUpSLP::buildTree(ArrayRef<Value *> Roots, |
| ArrayRef<Value *> UserIgnoreLst) { |
| ExtraValueToDebugLocsMap ExternallyUsedValues; |
| buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); |
| } |
| |
| void BoUpSLP::buildTree(ArrayRef<Value *> Roots, |
| ExtraValueToDebugLocsMap &ExternallyUsedValues, |
| ArrayRef<Value *> UserIgnoreLst) { |
| deleteTree(); |
| UserIgnoreList = UserIgnoreLst; |
| if (!allSameType(Roots)) |
| return; |
| buildTree_rec(Roots, 0, EdgeInfo()); |
| |
| // Collect the values that we need to extract from the tree. |
| for (auto &TEPtr : VectorizableTree) { |
| TreeEntry *Entry = TEPtr.get(); |
| |
| // No need to handle users of gathered values. |
| if (Entry->State == TreeEntry::NeedToGather) |
| continue; |
| |
| // For each lane: |
| for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { |
| Value *Scalar = Entry->Scalars[Lane]; |
| int FoundLane = Lane; |
| if (!Entry->ReuseShuffleIndices.empty()) { |
| FoundLane = |
| std::distance(Entry->ReuseShuffleIndices.begin(), |
| llvm::find(Entry->ReuseShuffleIndices, FoundLane)); |
| } |
| |
| // Check if the scalar is externally used as an extra arg. |
| auto ExtI = ExternallyUsedValues.find(Scalar); |
| if (ExtI != ExternallyUsedValues.end()) { |
| LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " |
| << Lane << " from " << *Scalar << ".\n"); |
| ExternalUses.emplace_back(Scalar, nullptr, FoundLane); |
| } |
| for (User *U : Scalar->users()) { |
| LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); |
| |
| Instruction *UserInst = dyn_cast<Instruction>(U); |
| if (!UserInst) |
| continue; |
| |
| // Skip in-tree scalars that become vectors |
| if (TreeEntry *UseEntry = getTreeEntry(U)) { |
| Value *UseScalar = UseEntry->Scalars[0]; |
| // Some in-tree scalars will remain as scalar in vectorized |
| // instructions. If that is the case, the one in Lane 0 will |
| // be used. |
| if (UseScalar != U || |
| !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { |
| LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U |
| << ".\n"); |
| assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); |
| continue; |
| } |
| } |
| |
| // Ignore users in the user ignore list. |
| if (is_contained(UserIgnoreList, UserInst)) |
| continue; |
| |
| LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " |
| << Lane << " from " << *Scalar << ".\n"); |
| ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); |
| } |
| } |
| } |
| } |
| |
| void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, |
| const EdgeInfo &UserTreeIdx) { |
| assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); |
| |
| InstructionsState S = getSameOpcode(VL); |
| if (Depth == RecursionMaxDepth) { |
| LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); |
| return; |
| } |
| |
| // Don't handle vectors. |
| if (S.OpValue->getType()->isVectorTy()) { |
| LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); |
| return; |
| } |
| |
| if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) |
| if (SI->getValueOperand()->getType()->isVectorTy()) { |
| LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); |
| return; |
| } |
| |
| // If all of the operands are identical or constant we have a simple solution. |
| if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { |
| LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); |
| return; |
| } |
| |
| // We now know that this is a vector of instructions of the same type from |
| // the same block. |
| |
| // Don't vectorize ephemeral values. |
| for (Value *V : VL) { |
| if (EphValues.count(V)) { |
| LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V |
| << ") is ephemeral.\n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); |
| return; |
| } |
| } |
| |
| // Check if this is a duplicate of another entry. |
| if (TreeEntry *E = getTreeEntry(S.OpValue)) { |
| LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); |
| if (!E->isSame(VL)) { |
| LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); |
| return; |
| } |
| // Record the reuse of the tree node. FIXME, currently this is only used to |
| // properly draw the graph rather than for the actual vectorization. |
| E->UserTreeIndices.push_back(UserTreeIdx); |
| LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue |
| << ".\n"); |
| return; |
| } |
| |
| // Check that none of the instructions in the bundle are already in the tree. |
| for (Value *V : VL) { |
| auto *I = dyn_cast<Instruction>(V); |
| if (!I) |
| continue; |
| if (getTreeEntry(I)) { |
| LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V |
| << ") is already in tree.\n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); |
| return; |
| } |
| } |
| |
| // If any of the scalars is marked as a value that needs to stay scalar, then |
| // we need to gather the scalars. |
| // The reduction nodes (stored in UserIgnoreList) also should stay scalar. |
| for (Value *V : VL) { |
| if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { |
| LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); |
| return; |
| } |
| } |
| |
| // Check that all of the users of the scalars that we want to vectorize are |
| // schedulable. |
| auto *VL0 = cast<Instruction>(S.OpValue); |
| BasicBlock *BB = VL0->getParent(); |
| |
| if (!DT->isReachableFromEntry(BB)) { |
| // Don't go into unreachable blocks. They may contain instructions with |
| // dependency cycles which confuse the final scheduling. |
| LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); |
| return; |
| } |
| |
| // Check that every instruction appears once in this bundle. |
| SmallVector<unsigned, 4> ReuseShuffleIndicies; |
| SmallVector<Value *, 4> UniqueValues; |
| DenseMap<Value *, unsigned> UniquePositions; |
| for (Value *V : VL) { |
| auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); |
| ReuseShuffleIndicies.emplace_back(Res.first->second); |
| if (Res.second) |
| UniqueValues.emplace_back(V); |
| } |
| size_t NumUniqueScalarValues = UniqueValues.size(); |
| if (NumUniqueScalarValues == VL.size()) { |
| ReuseShuffleIndicies.clear(); |
| } else { |
| LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); |
| if (NumUniqueScalarValues <= 1 || |
| !llvm::isPowerOf2_32(NumUniqueScalarValues)) { |
| LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); |
| return; |
| } |
| VL = UniqueValues; |
| } |
| |
| auto &BSRef = BlocksSchedules[BB]; |
| if (!BSRef) |
| BSRef = std::make_unique<BlockScheduling>(BB); |
| |
| BlockScheduling &BS = *BSRef.get(); |
| |
| Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); |
| if (!Bundle) { |
| LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); |
| assert((!BS.getScheduleData(VL0) || |
| !BS.getScheduleData(VL0)->isPartOfBundle()) && |
| "tryScheduleBundle should cancelScheduling on failure"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, |
| ReuseShuffleIndicies); |
| return; |
| } |
| LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); |
| |
| unsigned ShuffleOrOp = S.isAltShuffle() ? |
| (unsigned) Instruction::ShuffleVector : S.getOpcode(); |
| switch (ShuffleOrOp) { |
| case Instruction::PHI: { |
| auto *PH = cast<PHINode>(VL0); |
| |
| // Check for terminator values (e.g. invoke). |
| for (unsigned j = 0; j < VL.size(); ++j) |
| for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { |
| Instruction *Term = dyn_cast<Instruction>( |
| cast<PHINode>(VL[j])->getIncomingValueForBlock( |
| PH->getIncomingBlock(i))); |
| if (Term && Term->isTerminator()) { |
| LLVM_DEBUG(dbgs() |
| << "SLP: Need to swizzle PHINodes (terminator use).\n"); |
| BS.cancelScheduling(VL, VL0); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, |
| ReuseShuffleIndicies); |
| return; |
| } |
| } |
| |
| TreeEntry *TE = |
| newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); |
| LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); |
| |
| // Keeps the reordered operands to avoid code duplication. |
| SmallVector<ValueList, 2> OperandsVec; |
| for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { |
| ValueList Operands; |
| // Prepare the operand vector. |
| for (Value *j : VL) |
| Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( |
| PH->getIncomingBlock(i))); |
| TE->setOperand(i, Operands); |
| OperandsVec.push_back(Operands); |
| } |
| for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) |
| buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); |
| return; |
| } |
| case Instruction::ExtractValue: |
| case Instruction::ExtractElement: { |
| OrdersType CurrentOrder; |
| bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); |
| if (Reuse) { |
| LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); |
| ++NumOpsWantToKeepOriginalOrder; |
| newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| ReuseShuffleIndicies); |
| // This is a special case, as it does not gather, but at the same time |
| // we are not extending buildTree_rec() towards the operands. |
| ValueList Op0; |
| Op0.assign(VL.size(), VL0->getOperand(0)); |
| VectorizableTree.back()->setOperand(0, Op0); |
| return; |
| } |
| if (!CurrentOrder.empty()) { |
| LLVM_DEBUG({ |
| dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " |
| "with order"; |
| for (unsigned Idx : CurrentOrder) |
| dbgs() << " " << Idx; |
| dbgs() << "\n"; |
| }); |
| // Insert new order with initial value 0, if it does not exist, |
| // otherwise return the iterator to the existing one. |
| auto StoredCurrentOrderAndNum = |
| NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; |
| ++StoredCurrentOrderAndNum->getSecond(); |
| newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| ReuseShuffleIndicies, |
| StoredCurrentOrderAndNum->getFirst()); |
| // This is a special case, as it does not gather, but at the same time |
| // we are not extending buildTree_rec() towards the operands. |
| ValueList Op0; |
| Op0.assign(VL.size(), VL0->getOperand(0)); |
| VectorizableTree.back()->setOperand(0, Op0); |
| return; |
| } |
| LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, |
| ReuseShuffleIndicies); |
| BS.cancelScheduling(VL, VL0); |
| return; |
| } |
| case Instruction::Load: { |
| // Check that a vectorized load would load the same memory as a scalar |
| // load. For example, we don't want to vectorize loads that are smaller |
| // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, |
| // LLVM treats loading/storing it as an i8 struct. If we vectorize |
| // loads/stores from such a struct, we read/write packed bits, disagreeing |
| // with the unvectorized version. |
| Type *ScalarTy = VL0->getType(); |
| |
| if (DL->getTypeSizeInBits(ScalarTy) != |
| DL->getTypeAllocSizeInBits(ScalarTy)) { |
| BS.cancelScheduling(VL, VL0); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, |
| ReuseShuffleIndicies); |
| LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); |
| return; |
| } |
| |
| // Make sure all loads in the bundle are simple - we can't vectorize |
| // atomic or volatile loads. |
| SmallVector<Value *, 4> PointerOps(VL.size()); |
| auto POIter = PointerOps.begin(); |
| for (Value *V : VL) { |
| auto *L = cast<LoadInst>(V); |
| if (!L->isSimple()) { |
| BS.cancelScheduling(VL, VL0); |
| newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, |
| ReuseShuffleIndicies); |
| LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); |
| return; |
| } |
| *POIter = L->getPointerOperand(); |
| ++POIter; |
| } |
| |
| OrdersType CurrentOrder; |
| // Check the order of pointer operands. |
| if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { |
| Value *Ptr0; |
| Value *PtrN; |
| if (CurrentOrder.empty()) { |
| Ptr0 = PointerOps.front(); |
| PtrN = PointerOps.back(); |
| } else { |
| Ptr0 = PointerOps[CurrentOrder.front()]; |
| PtrN = PointerOps[CurrentOrder.back()]; |
| } |
| const SCEV *Scev0 = SE->getSCEV(Ptr0); |
| const SCEV *ScevN = SE->getSCEV(PtrN); |
| const auto *Diff = |
| dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); |
| uint64_t Size = DL->getTypeAllocSize(ScalarTy); |
| // Check that the sorted loads are consecutive. |
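| // E.g. for four i32 loads (Size == 4 bytes) the distance between the |
| // first and the last sorted pointer must be (4 - 1) * 4 == 12 bytes. |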
| if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { |
| if (CurrentOrder.empty()) { |
| // Original loads are consecutive and do not require reordering. |
| ++NumOpsWantToKeepOriginalOrder; |
| TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, |
| UserTreeIdx, ReuseShuffleIndicies); |
| TE->setOperandsInOrder(); |
| LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); |
| } else { |
|