| //===- LoopEmitter.cpp ----------------------------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "LoopEmitter.h" |
| #include "CodegenUtils.h" |
| |
| #include "mlir/Dialect/Arith/IR/Arith.h" |
| #include "mlir/Dialect/Bufferization/IR/Bufferization.h" |
| #include "mlir/Dialect/Linalg/IR/Linalg.h" |
| #include "mlir/Dialect/Linalg/Utils/Utils.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/SCF/IR/SCF.h" |
| #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h" |
| #include "mlir/Dialect/Tensor/IR/Tensor.h" |
| #include "mlir/Dialect/Vector/IR/VectorOps.h" |
| |
| using namespace mlir; |
| using namespace mlir::sparse_tensor; |
| |
| //===----------------------------------------------------------------------===// |
| // File local shorthand macros |
| //===----------------------------------------------------------------------===// |
| |
| #define CMPI(p, l, r) \ |
| (builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::p, (l), (r)) \ |
| .getResult()) |
| |
| #define C_IDX(v) (constantIndex(builder, loc, (v))) |
| #define YIELD(vs) (builder.create<scf::YieldOp>(loc, (vs))) |
| #define ADDI(lhs, rhs) (builder.create<arith::AddIOp>(loc, (lhs), (rhs))) |
| #define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs))) |
| #define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs))) |
| #define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs))) |
| #define REMUI(lhs, rhs) (builder.create<arith::RemUIOp>(loc, (lhs), (rhs))) |
| #define DIVUI(lhs, rhs) (builder.create<arith::DivUIOp>(loc, (lhs), (rhs))) |
| #define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r))) |
| |
| //===----------------------------------------------------------------------===// |
| // Debugging utils |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef NDEBUG |
| LLVM_ATTRIBUTE_UNUSED static void dumpIndexMemRef(OpBuilder &builder, |
| Location loc, Value memref) { |
| memref = builder.create<memref::CastOp>( |
| loc, UnrankedMemRefType::get(builder.getIndexType(), 0), memref); |
| createFuncCall(builder, loc, "printMemrefInd", TypeRange{}, |
| ValueRange{memref}, EmitCInterface::On); |
| } |
| #endif |
| |
| //===----------------------------------------------------------------------===// |
| // File local helper functions. |
| //===----------------------------------------------------------------------===// |
| |
| // For index reduction loops, since the tensors are sliced into non-contiguous |
| // fragments, we need a triple [pLo, pHi, pPtr], in which the pair (pLo, pHi) |
| // specifies the range of the fragment, and pPtr specifies the index of the |
| // corresponding fragment in the child level (i.e., a pointer to the sliced |
| // position array). |
| static constexpr unsigned kSliceIterWidth = 3; |
| |
| static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor, |
| Level lvl) { |
| auto enc = getSparseTensorEncoding(tensor.getType()); |
| return createOrFoldSliceOffsetOp(builder, loc, tensor, toDim(enc, lvl)); |
| } |
| |
| static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor, |
| Level lvl) { |
| auto enc = getSparseTensorEncoding(tensor.getType()); |
| return createOrFoldSliceStrideOp(builder, loc, tensor, toDim(enc, lvl)); |
| } |
| |
| /// Converts a coordinate relative to the slice to the coordinate relative |
| /// to the underlying tensor. |
| // FIXME: that description says "sliceCrd -> tensorCrd"; but the function |
| // name suggests it should be "tensorCrd -> sliceCrd". |
| static Value toSliceCrd(OpBuilder &builder, Location loc, Value crd, |
| Value offset, Value stride, Value tensor, Level lvl) { |
| // tensorCrd = sliceCrd * stride + offset |
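| // E.g., for a slice with offset 1 and stride 2, slice coordinate 3 maps to |
| // tensor coordinate 3 * 2 + 1 = 7. |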
| return ADDI(MULI(crd, stride), offset); |
| } |
| |
| /// Generates code to compute the *absolute* offset of the slice based on the |
| /// provided minimum coordinate in the slice. |
| /// E.g., when reducing d0 + d1 + d2, we need two slices to fully reduce the |
| /// expression, i.e., s1 = slice(T, d0), s2 = slice(s1, d1). The *absolute* |
| /// offset is the offset computed relative to the initial tensor T. |
| /// |
| /// When isNonEmpty == false, the computed offset is meaningless and should |
| /// not be used at runtime; in that case the method currently generates code |
| /// that returns 0. |
| /// |
| /// offset = isNonEmpty && minCrd >= size ? minCrd - size + 1 : 0; |
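| /// E.g., with size == 4 and minCrd == 9 in a non-empty slice, the generated |
| /// code computes offset = 9 - 4 + 1 = 6. |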
| static Value offsetFromMinCoord(OpBuilder &builder, Location loc, Value minCrd, |
| Value size, Value isNonEmpty) { |
| Value geSize = CMPI(uge, minCrd, size); |
| Value pred = ANDI(isNonEmpty, geSize); |
| // Computes minCrd - size + 1 |
| Value mms = SUBI(ADDI(minCrd, C_IDX(1)), size); |
| // This is the absolute offset relative to the underlying tensor. |
| return SELECT(pred, mms, C_IDX(0)); |
| } |
| |
| /// Converts a coordinate relative to the underlying tensor to the coordinate |
| /// relative to the slice; also returns an extra remainder value. |
| // FIXME: that description says "tensorCrd -> sliceCrd"; but the function |
| // name suggests it should be "sliceCrd -> tensorCrd". |
| static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc, |
| Value crd, Value offset, |
| Value stride, Value tensor, |
| Level lvl) { |
| // sliceCrd = (tensorCrd - offset) / stride |
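| // E.g., with offset 1 and stride 2, tensor coordinate 7 maps to slice |
| // coordinate (7 - 1) / 2 = 3 with remainder 0; a nonzero remainder means the |
| // tensor coordinate is not covered by the slice. |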
| crd = SUBI(crd, offset); |
| Value rem = REMUI(crd, stride); |
| crd = DIVUI(crd, stride); |
| return std::make_pair(crd, rem); |
| } |
| |
| // Generates a boolean value for the while-loop condition that iterates over |
| // a fully reduced level with an affine index expression. |
| static Value genSparseReducedAffineCond(OpBuilder &builder, Location loc, |
| Value crdBuf, Value crdHi, Value posit, |
| Value posHi) { |
| Value inBound = CMPI(ult, posit, posHi); |
| auto ifOp = |
| builder.create<scf::IfOp>(loc, builder.getI1Type(), inBound, true); |
| // if (inbound) |
| // yield coord < crdHi |
| builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); |
| Value crd = genIndexLoad(builder, loc, crdBuf, posit); |
| YIELD(CMPI(ult, crd, crdHi)); |
| // else |
| // yield false |
| builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); |
| YIELD(constantI1(builder, loc, false)); |
| |
| builder.setInsertionPointAfter(ifOp); |
| return ifOp.getResult(0); |
| } |
| |
| // Helper functions that load/store into the position buffer for slice-driven |
| // loops. |
| // The sliced pointer buffer is organized as: |
| // [[pLo0, pLo1, pLo2, ...], |
| // [pHi0, pHi1, pHi2, ...], |
| // [pNx0, pNx1, pNx2, ...]] |
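| // For a buffer holding `tupleCnt` tuples, the entries of tuple `i` are thus |
| // stored at indices i (pLo), i + tupleCnt (pHi), and i + 2 * tupleCnt (pNx); |
| // see getSlicePosIdx below. |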
| static Value allocSlicePosBuf(OpBuilder &builder, Location loc, |
| Value tupleCnt) { |
| Value bufSz = MULI(tupleCnt, C_IDX(kSliceIterWidth)); |
| // Additional two metadata {memSize, idx} at head. |
| return genAlloca(builder, loc, bufSz, builder.getIndexType()); |
| } |
| |
| // Gets and sets position values for slice-driven loops. |
| enum class SlicePosKind { kLo, kHi, kNext }; |
| static Value getSlicePosIdx(OpBuilder &builder, Location loc, Value posBuf, |
| Value tupleIdx, SlicePosKind posKind) { |
| Value dim = builder.create<memref::DimOp>(loc, posBuf, C_IDX(0)); |
| Value tupleCnt = DIVUI(dim, C_IDX(kSliceIterWidth)); |
| switch (posKind) { |
| case SlicePosKind::kLo: |
| return tupleIdx; |
| case SlicePosKind::kHi: |
| return ADDI(tupleIdx, tupleCnt); |
| case SlicePosKind::kNext: |
| return ADDI(tupleIdx, MULI(tupleCnt, C_IDX(2))); |
| } |
| llvm_unreachable("unexpected kind"); |
| } |
| static Value loadSlicePos(OpBuilder &builder, Location loc, Value sPosBuf, |
| Value tupleIdx, SlicePosKind posKind) { |
| return genIndexLoad(builder, loc, sPosBuf, |
| getSlicePosIdx(builder, loc, sPosBuf, tupleIdx, posKind)); |
| } |
| static void updateSlicePos(OpBuilder &builder, Location loc, Value sPosBuf, |
| Value pos, Value tupleIdx, SlicePosKind posKind) { |
| builder.create<memref::StoreOp>( |
| loc, pos, sPosBuf, |
| getSlicePosIdx(builder, loc, sPosBuf, tupleIdx, posKind)); |
| } |
| |
| std::pair<Value, Value> |
| LoopEmitter::genSliceLegitPredicate(OpBuilder &builder, Location loc, Value crd, |
| TensorId tid, Level lvl) { |
| assert(isSparseSlices[tid]); |
| Value slice = tensors[tid]; |
| Value offset = sliceOffsets[tid][lvl]; |
| Value stride = sliceStrides[tid][lvl]; |
| auto enc = getSparseTensorEncoding(slice.getType()); |
| |
| const auto [newCrd, crdRem] = |
| fromSliceCrd(builder, loc, crd, offset, stride, slice, lvl); |
| |
| SmallVector<Value, 3> conds; // at most 3 conditions |
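| // A coordinate `crd` of the underlying tensor is valid for the slice iff |
| // crd >= offset, (crd - offset) / stride < length, and |
| // (crd - offset) % stride == 0; statically trivial checks are skipped below. |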
| |
| // First, coord >= offset (skip the check if offset is known to be 0). |
| if (auto staticOffset = enc.getStaticLvlSliceOffset(lvl); |
| !(staticOffset.has_value() && *staticOffset == 0)) { |
| auto geOffset = CMPI(uge, crd, offset); |
| conds.push_back(geOffset); |
| } |
| |
| // Second, coord_in_slice < length |
| auto ltLength = CMPI(ult, newCrd, lvlSizes[tid][lvl]); |
| conds.push_back(ltLength); |
| |
| // Third, rem == 0 (skip the check if stride is known to be 1). |
| if (auto staticStride = enc.getStaticLvlSliceStride(lvl); |
| !(staticStride.has_value() && *staticStride == 1)) { |
| auto fitStride = CMPI(eq, crdRem, C_IDX(0)); |
| conds.push_back(fitStride); |
| } |
| |
| // Must meet all conditions to be a valid coordinate in the slice. |
| auto pred = conds.front(); |
| for (auto cond : ValueRange(conds).drop_front()) |
| pred = ANDI(pred, cond); |
| |
| return {newCrd, pred}; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Sparse tensor loop emitter class implementations |
| //===----------------------------------------------------------------------===// |
| |
| Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, TensorId tid, |
| Level lvl, Value crd) { |
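| // The flattened address of (parent position, crd) is parentPos * bound + crd, |
| // where `bound` is the upper bound of this level (its size for dense levels); |
| // for slices, crd is first mapped back to the underlying tensor coordinate. |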
| Value pos = lvl == 0 ? C_IDX(0) : posits[tid][lvl - 1]; |
| Value mul = MULI(highs[tid][lvl], pos); |
| if (isSparseSlices[tid]) |
| crd = toSliceCrd(builder, loc, crd, sliceOffsets[tid][lvl], |
| sliceStrides[tid][lvl], tensors[tid], lvl); |
| Value add = ADDI(mul, crd); |
| return add; |
| } |
| |
| Value LoopEmitter::genSegmentHigh(OpBuilder &builder, Location loc, |
| TensorId tid, Level lvl, Value pLo, |
| Value pHi) { |
| const auto coordinates = coordinatesBuffers[tid][lvl]; |
| const auto sameCrd = genIndexLoad(builder, loc, coordinates, pLo); |
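| // Scan forward from pLo while the loaded coordinate still equals the one at |
| // pLo (and the position stays below pHi); the while-loop result is the first |
| // position with a different coordinate, i.e., the segment high. |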
| auto whileOp = builder.create<scf::WhileOp>( |
| loc, builder.getIndexType(), pLo, |
| /*beforeBuilder=*/ |
| [pHi, coordinates, sameCrd](OpBuilder &builder, Location loc, |
| ValueRange ivs) { |
| const auto pos = ivs[0]; |
| Value inBound = builder.create<arith::CmpIOp>( |
| loc, arith::CmpIPredicate::ult, pos, pHi); |
| auto ifInBound = |
| builder.create<scf::IfOp>(loc, builder.getI1Type(), inBound, true); |
| { |
| OpBuilder::InsertionGuard guard(builder); |
| // Load the next coordinates only when inbound (to avoid OOB |
| // accesses). |
| builder.setInsertionPointToStart(ifInBound.thenBlock()); |
| Value crd = genIndexLoad(builder, loc, coordinates, pos); |
| Value isSameCrd = builder.create<arith::CmpIOp>( |
| loc, arith::CmpIPredicate::eq, crd, sameCrd); |
| YIELD(isSameCrd); |
| // Else, the position is out of bound, yield false to terminate the |
| // loop. |
| builder.setInsertionPointToStart(ifInBound.elseBlock()); |
| YIELD(constantI1(builder, loc, false)); |
| } |
| builder.create<scf::ConditionOp>(loc, ifInBound.getResults()[0], ivs); |
| }, |
| /*afterBuilder=*/ |
| [](OpBuilder &builder, Location loc, ValueRange ivs) { |
| // pos ++ |
| Value nextPos = ADDI(ivs[0], C_IDX(1)); |
| YIELD(nextPos); |
| }); |
| // Return the segment high. |
| return whileOp.getResult(0); |
| } |
| |
| Value LoopEmitter::genSparseCrd(OpBuilder &builder, Location loc, TensorId tid, |
| Level lvl) { |
| // A load on the coordinates array yields the coordinate. |
| const Value mem = coordinatesBuffers[tid][lvl]; |
| /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header. |
| const Value pos = posits[tid][lvl]; |
| const Value crd = genIndexLoad(builder, loc, mem, pos); |
| return crd; |
| } |
| |
| LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput, |
| bool isSparseOut, unsigned numLoops, |
| DependentLvlGetter dimGetter) { |
| initialize(tensors, loopTag, hasOutput, isSparseOut, numLoops, dimGetter); |
| } |
| |
| void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput, |
| bool isSparseOut, unsigned numLoops, |
| DependentLvlGetter dimGetter) { |
| // First initialize the top-level type of the fields. |
| this->loopTag = loopTag; |
| this->hasOutput = hasOutput; |
| this->isSparseOut = isSparseOut; |
| |
| const unsigned numManifestTensors = ts.size(); |
| const unsigned synTensorId = numManifestTensors; |
| const unsigned numTensors = numManifestTensors + 1; |
| // tensors array (len == numManifestTensors). |
| this->tensors.assign(ts.begin(), ts.end()); |
| // Arrays with len == numTensors. |
| this->lvlTypes.assign(numTensors, std::vector<LevelType>()); |
| this->lvlSizes.assign(numTensors, std::vector<Value>()); |
| this->highs.assign(numTensors, std::vector<Value>()); |
| this->segHi.assign(numTensors, std::vector<Value>()); |
| this->posits.assign(numTensors, std::vector<Value>()); |
| this->coords.assign(numTensors, std::vector<Value>()); |
| this->positionsBuffers.assign(numTensors, std::vector<Value>()); |
| this->coordinatesBuffers.assign(numTensors, std::vector<Value>()); |
| this->valBuffer.assign(numTensors, nullptr); |
| this->isSparseSlices.assign(numTensors, false); |
| this->sliceOffsets.assign(numTensors, std::vector<Value>()); |
| this->sliceStrides.assign(numTensors, std::vector<Value>()); |
| |
| // These zeros will be overwritten below, but we need to initialize |
| // them to something since we'll need random-access assignment. |
| this->loopStack.reserve(numLoops); |
| this->loopSeqStack.reserve(numLoops); |
| |
| // Index-reduction related fields. |
| this->dependentLvlMap.assign( |
| numTensors, std::vector<std::vector<std::pair<TensorLevel, unsigned>>>()); |
| this->slicePosBuffer.assign(numTensors, std::vector<std::vector<Value>>()); |
| this->sliceTupleNxStartIdx.assign(numTensors, std::vector<Value>()); |
| this->sliceTupleFwdCnt.assign(numTensors, std::vector<Value>()); |
| this->trivialSlice.assign(numTensors, std::vector<bool>()); |
| this->sliceMeta.assign( |
| numTensors, std::vector<std::vector<std::pair<Value, unsigned>>>()); |
| this->sliceStack.assign(numTensors, std::vector<SliceInfo>()); |
| this->levelReducedDep.assign(numTensors, std::vector<unsigned>()); |
| |
| // Initialize nested types of `TensorId`-indexed fields. |
| for (TensorId tid = 0; tid < numTensors; tid++) { |
| Level lvlRank; |
| if (tid == synTensorId) { |
| // The synthetic tensor is (conceptually) an all-dense tensor with rank equal |
| // to the total number of loops (each level can potentially be mapped to |
| // one of the loops being generated). |
| lvlRank = numLoops; |
| lvlTypes[tid].assign(lvlRank, LevelType::Dense); |
| } else { |
| const Value t = tensors[tid]; |
| // A scalar or zero-dimensional tensor. |
| if (isZeroRankedTensorOrScalar(t.getType())) |
| continue; |
| |
| auto rtp = getRankedTensorType(t); |
| const SparseTensorType stt(rtp); |
| lvlRank = stt.getLvlRank(); |
| |
| if (stt.hasEncoding()) { |
| const auto enc = stt.getEncoding(); |
| isSparseSlices[tid] = enc.isSlice(); |
| for (auto lvlTp : enc.getLvlTypes()) |
| lvlTypes[tid].push_back(lvlTp); |
| } else { |
| lvlTypes[tid].assign(lvlRank, LevelType::Dense); |
| } |
| } |
| |
| // Initialize using empty value. |
| lvlSizes[tid].assign(lvlRank, Value()); |
| highs[tid].assign(lvlRank, Value()); |
| segHi[tid].assign(lvlRank, Value()); |
| posits[tid].assign(lvlRank, Value()); |
| coords[tid].assign(lvlRank, Value()); |
| positionsBuffers[tid].assign(lvlRank, Value()); |
| coordinatesBuffers[tid].assign(lvlRank, Value()); |
| sliceOffsets[tid].assign(lvlRank, Value()); |
| sliceStrides[tid].assign(lvlRank, Value()); |
| |
| // Slice-driven loops related initialization. |
| levelReducedDep[tid].assign(lvlRank, 0); |
| dependentLvlMap[tid].assign( |
| lvlRank, std::vector<std::pair<TensorLevel, unsigned>>()); |
| slicePosBuffer[tid].assign(lvlRank, std::vector<Value>()); |
| sliceTupleNxStartIdx[tid].assign(lvlRank, Value()); |
| sliceTupleFwdCnt[tid].assign(lvlRank, Value()); |
| trivialSlice[tid].assign(lvlRank, false); |
| sliceMeta[tid].assign(lvlRank, std::vector<std::pair<Value, unsigned>>()); |
| sliceStack[tid].emplace_back(/*minCrd=*/Value(), |
| /*offset=*/Value(), /*isNonEmpty*/ Value(), |
| /*posTupleNum=*/Value(), std::nullopt, 0); |
| if (dimGetter && !isSynTensor(tid)) { |
| for (Level l = 0; l < lvlRank; l++) { |
| dependentLvlMap[tid][l] = dimGetter(tid, l); |
| unsigned depends = dependentLvlMap[tid][l].size(); |
| if (depends == 0) |
| continue; |
| sliceMeta[tid][l].assign(depends, std::make_pair(nullptr, 0)); |
| // We need `depends - 1` slices to fully reduce the affine expression. |
| slicePosBuffer[tid][l].assign(depends - 1, nullptr); |
| } |
| } |
| } |
| } |
| |
| void LoopEmitter::initializeLoopEmit( |
| OpBuilder &builder, Location loc, LoopEmitter::OutputUpdater updater, |
| LoopEmitter::SynTensorBoundSetter synSetter) { |
| |
| // For every synthetic tensor, set the high bound by calling the callback. |
| if (synSetter) |
| for (unsigned i = 0, e = highs[getSynTensorId()].size(); i < e; i++) |
| highs[getSynTensorId()][i] = synSetter(builder, loc, i); |
| |
| // For every manifest tensor: |
| // * get the values buffer. |
| // * For every level: |
| // * get the positions and coordinates buffers |
| // * get/compute the level-size, which is also used as the upper-bound |
| // on positions. |
| for (TensorId t = 0, numTensors = getNumManifestTensors(); t < numTensors; |
| t++) { |
| const Value tensor = tensors[t]; |
| const auto rtp = dyn_cast<RankedTensorType>(tensor.getType()); |
| if (!rtp) |
| // Skips only scalars; zero-ranked tensors still need to be bufferized and |
| // (probably) filled with zeros by users. |
| continue; |
| // FIXME: the definition of `lvlRank` looks more like a dim-rank; |
| // but the variable is used as a level everywhere below, which |
| // suggests there may be some dim/lvl confusion going on here. |
| auto stt = getSparseTensorType(tensor); |
| const Level lvlRank = stt.getLvlRank(); |
| const auto shape = rtp.getShape(); |
| const Level cooStart = stt.getCOOStart(); |
| |
| SmallVector<Value> lvlSzs; |
| for (Level l = 0; l < stt.getLvlRank(); l++) { |
| if (stt.hasEncoding()) |
| lvlSzs.push_back(builder.create<LvlOp>(loc, tensor, l)); |
| else |
| lvlSzs.push_back(builder.create<tensor::DimOp>(loc, tensor, l)); |
| } |
| |
| // Scan all levels of current tensor. |
| for (Level l = 0; l < lvlRank; l++) { |
| // This should be called only once at beginning. |
| assert(!positionsBuffers[t][l] && !coordinatesBuffers[t][l] && |
| !highs[t][l]); |
| const auto lvlTp = lvlTypes[t][l]; |
| // Handle sparse storage schemes. |
| if (isCompressedLT(lvlTp) || isLooseCompressedLT(lvlTp)) { |
| // Generate sparse primitives to obtain positions and coordinates. |
| positionsBuffers[t][l] = genToPositions(builder, loc, tensor, l); |
| coordinatesBuffers[t][l] = |
| genToCoordinates(builder, loc, tensor, l, cooStart); |
| } else if (isSingletonLT(lvlTp) || is2OutOf4LT(lvlTp)) { |
| // Singleton level, fetch coordinates. |
| coordinatesBuffers[t][l] = |
| genToCoordinates(builder, loc, tensor, l, cooStart); |
| } else { |
| // Dense level, nothing to fetch. |
| assert(isDenseLT(lvlTp)); |
| } |
| |
| // Find upper bound in current dimension. |
| highs[t][l] = lvlSizes[t][l] = lvlSzs[l]; |
| if (isSparseSlices[t]) { |
| sliceOffsets[t][l] = genSliceOffset(builder, loc, tensors[t], l); |
| sliceStrides[t][l] = genSliceStride(builder, loc, tensors[t], l); |
| } |
| } |
| |
| // Perform the required bufferization. Dense inputs materialize |
| // from the input tensors. Sparse inputs use sparse primitives to obtain the |
| // values. |
| // Delegates extra output initialization to clients. |
| bool isOutput = isOutputTensor(t); |
| Type elementType = stt.getElementType(); |
| if (!stt.hasEncoding()) { |
| // Non-annotated dense tensors. |
| BaseMemRefType denseTp = MemRefType::get(shape, elementType); |
| |
| // TODO: if we unconditionally use a fully dynamic layout here, it breaks |
| // some vectorization passes, which require static stride == 1. |
| // Is it possible to run the vectorization pass after bufferization? |
| if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp())) |
| denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp); |
| |
| Value denseVal = |
| builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor); |
| // Dense outputs need special handling. |
| if (isOutput && updater) |
| denseVal = updater(builder, loc, denseVal, tensor); |
| |
| valBuffer[t] = denseVal; |
| } else { |
| // Annotated sparse tensors. |
| // We also need the value buffer for all-dense annotated "sparse" |
| // tensors. |
| valBuffer[t] = genToValues(builder, loc, tensor); |
| } |
| // NOTE: we could also prepare for lvl 0 here in advance; this would hoist |
| // some loop preparation out of tensor iteration, but would also (undesirably) |
| // hoist the code outside if-conditions. |
| } |
| |
| Type indexType = builder.getIndexType(); |
| Value c0 = constantZero(builder, loc, indexType); |
| for (TensorId t = 0, e = tensors.size(); t < e; t++) { |
| auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType()); |
| if (!rtp) |
| continue; |
| |
| Level lvlRank = SparseTensorType(rtp).getLvlRank(); |
| for (Level lvl = 0; lvl < lvlRank; lvl++) { |
| if (!dependentLvlMap[t][lvl].empty()) { |
| ArrayRef<std::pair<TensorLevel, unsigned>> depLvls = |
| dependentLvlMap[t][lvl]; |
| // Needs at least two operands to form a non-trivial affine expression. |
| assert(depLvls.size() == sliceMeta[t][lvl].size()); |
| |
| Value size = c0; |
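| // Accumulate the strided sizes from the innermost dependent level outward. |
| // Illustrative example (assuming the dependences are stored in the order |
| // (d0, stride 1), (d1, stride 2) with level sizes 8 and 3): sliceMeta records |
| // (3 * 2, 2) = (6, 2) and then (6 + 8, 1) = (14, 1). |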
| for (int e = depLvls.size() - 1; e >= 0; e--) { |
| auto [dt, dl] = unpackTensorLevel(depLvls[e].first); |
| unsigned stride = depLvls[e].second; |
| Value stridedSize = lvlSizes[dt][dl]; |
| if (stride != 1) |
| stridedSize = MULI(stridedSize, C_IDX(stride)); |
| size = ADDI(size, stridedSize); |
| sliceMeta[t][lvl][e] = std::make_pair(size, stride); |
| } |
| } |
| } |
| } |
| localInsertPos = builder.getInsertionPoint()->getPrevNode(); |
| } |
| |
| void LoopEmitter::categorizeLoopCondition( |
| ArrayRef<TensorLevel> tidLvls, SmallVectorImpl<TensorLvlCond> &dnConds, |
| SmallVectorImpl<TensorLvlCond> &spConds) { |
| // Finds the tensor levels that we should use to generate loops. Among all |
| // the tensor levels, there is at most one sparse tensor level. |
| for (auto [t, l] : unpackTensorLevelRange(tidLvls)) { |
| assert(lvlTypes[t].size() > l); // Must be a valid tid, dim pair |
| auto lvlType = lvlTypes[t][l]; |
| // Must be a recognizable LT. |
| assert(isDenseLT(lvlType) || isCompressedLT(lvlType) || |
| isLooseCompressedLT(lvlType) || isSingletonLT(lvlType) || |
| is2OutOf4LT(lvlType)); |
| |
| bool isSparse = !isDenseLT(lvlType); |
| bool isSlice = isSparseSlices[t]; |
| bool isAffine = !dependentLvlMap[t][l].empty(); |
| bool isUnRedu = false; |
| // TODO: Support affine index expressions on sparse tensor slices. |
| assert(!isSlice || !isAffine); |
| |
| // Whether the affine index expression has been fully reduced or not. |
| if (!dependentLvlMap[t][l].empty()) |
| isUnRedu = !depFullyReduced(t, l); |
| |
| auto &dstVec = isSparse ? spConds : dnConds; |
| dstVec.emplace_back( |
| makeTensorLevel(t, l), |
| makeLoopCondKind(isSparse, isSlice, isAffine, isUnRedu)); |
| } |
| |
| std::stable_sort(spConds.begin(), spConds.end(), [](auto lhs, auto rhs) { |
| // AffineUnRed > Affine > Slice > Trivial |
| return static_cast<uint8_t>(lhs.second) > static_cast<uint8_t>(rhs.second); |
| }); |
| } |
| |
| void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc, |
| ArrayRef<TensorLevel> tidLvls) { |
| // TODO: sort |
| assert(loopSeqStack.size() == loopStack.size()); |
| // Prepares for all the tensors used in the current loop sequence. |
| std::vector<std::tuple<TensorId, Level, bool>> slicedTids; |
| |
| for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) { |
| if (!dependentLvlMap[tid][lvl].empty()) { |
| bool fullyRed = genSliceBegin(builder, loc, tid, lvl); |
| slicedTids.emplace_back(tid, lvl, fullyRed); |
| } else if (!isSynTensor(tid)) { |
| prepareLoopOverTensorAtLvl(builder, loc, tid, lvl); |
| } |
| } |
| |
| // Universal Index starts from 0. |
| loopSeqStack.emplace_back(C_IDX(0), std::move(slicedTids)); |
| } |
| |
| void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) { |
| assert(loopSeqStack.size() == loopStack.size() + 1); |
| |
| const auto &slicedTids = loopSeqStack.back().second; |
| |
| // Depending on whether each slice is resolved or not in the current loop |
| // sequence, end them in different ways. |
| for (auto [tid, lvl, res] : slicedTids) { |
| if (!res) { |
| // If this is an unresolved-slice-driven loop, pop the slice. |
| assert(sliceStack[tid].back().slicedOnLvl == lvl); |
| sliceStack[tid].pop_back(); |
| } |
| } |
| loopSeqStack.pop_back(); |
| } |
| |
| Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) { |
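| // E.g., an affine expression such as d0 * 2 + d1 recursively lowers to |
| // arith.muli and arith.addi over the induction variables of the |
| // corresponding loops. |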
| switch (a.getKind()) { |
| case AffineExprKind::DimId: { |
| // FIXME: since the one callsite in Sparsification passes in a |
| // level-expression, the `getPosition` must in fact be a `Dimension`. |
| // However, elsewhere we have been led to expect that `loopIdToOrd` |
| // should be indexed by `LoopId`... |
| const auto loopId = cast<AffineDimExpr>(a).getPosition(); |
| return loopStack[loopId].iv; |
| } |
| case AffineExprKind::Add: { |
| auto binOp = cast<AffineBinaryOpExpr>(a); |
| return ADDI(genAffine(builder, loc, binOp.getLHS()), |
| genAffine(builder, loc, binOp.getRHS())); |
| } |
| case AffineExprKind::Mul: { |
| auto binOp = cast<AffineBinaryOpExpr>(a); |
| return MULI(genAffine(builder, loc, binOp.getLHS()), |
| genAffine(builder, loc, binOp.getRHS())); |
| } |
| case AffineExprKind::Constant: { |
| int64_t c = cast<AffineConstantExpr>(a).getValue(); |
| return C_IDX(c); |
| } |
| default: |
| llvm_unreachable("unexpected affine subscript"); |
| } |
| } |
| |
| std::pair<Operation *, Value> LoopEmitter::emitForLoopOverTensorAtLvl( |
| OpBuilder &builder, Location loc, TensorId tid, Level lvl, Value lo, |
| Value hi, MutableArrayRef<Value> reduc, bool isParallel) { |
| bool isSparseCond = isCompressedLT(lvlTypes[tid][lvl]) || |
| isLooseCompressedLT(lvlTypes[tid][lvl]) || |
| is2OutOf4LT(lvlTypes[tid][lvl]) || |
| isSingletonLT(lvlTypes[tid][lvl]); |
| // TODO: support dynamic slices. |
| // Uses the first dimension here to build the loop bound (which is also the |
| // biggest range). |
| Value step = C_IDX(1); |
| Operation *loop = nullptr; |
| Value iv; |
| if (isParallel) { |
| scf::ParallelOp parOp = |
| builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc); |
| builder.setInsertionPointToStart(parOp.getBody()); |
| assert(parOp.getNumReductions() == reduc.size()); |
| iv = parOp.getInductionVars()[0]; |
| |
| // In-place update on the reduction variable vector. |
| // Note that the init vals are not the actual reduction variables but are |
| // instead used as "special handles" to (temporarily) represent them. The |
| // expression on the init vals will be moved into scf.reduce and replaced with |
| // the block arguments when exiting the loop (see exitForLoop). This is |
| // needed because we cannot build the actual reduction block and get the |
| // actual reduction variable before users fill in the parallel loop body. |
| for (int i = 0, e = reduc.size(); i < e; i++) |
| reduc[i] = parOp.getInitVals()[i]; |
| loop = parOp; |
| } else { |
| scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc); |
| builder.setInsertionPointToStart(forOp.getBody()); |
| iv = forOp.getInductionVar(); |
| |
| // In-place update on the reduction variable vector. |
| assert(forOp.getNumRegionIterArgs() == reduc.size()); |
| for (int i = 0, e = reduc.size(); i < e; i++) |
| reduc[i] = forOp.getRegionIterArg(i); |
| loop = forOp; |
| } |
| assert(loop && iv); |
| |
| Value crd; |
| if (isSparseCond) { |
| // For COO, the position is the same across consecutive levels. |
| /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header. |
| posits[tid][lvl] = iv; |
| crd = genSparseCrd(builder, loc, tid, lvl); |
| } else { |
| // Dense tensor, the coordinate is the induction variable. |
| crd = iv; |
| } |
| |
| if (isSparseSlices[tid] && isSparseCond) { |
| // For sparse level slices, we need to filter out invalid coordinates that |
| // are not included in the slice. |
| SmallVector<Type> types; |
| for (Value red : reduc) |
| types.push_back(red.getType()); |
| |
| auto [trans, pred] = genSliceLegitPredicate(builder, loc, crd, tid, lvl); |
| bool hasReduc = !types.empty(); |
| scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, pred, |
| /*else*/ hasReduc); |
| if (hasReduc) { |
| // scf.for (a) -> v |
| // %s = scf.if (a) -> v |
| // user-generated code. |
| // else |
| // yield a |
| // yield %s |
| YIELD(ifOp.getResults()); |
| builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); |
| // On mismatch. |
| YIELD(reduc); |
| } |
| // Set the insertion point to matched branch. |
| builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); |
| crd = trans; |
| } |
| |
| assert(crd); |
| coords[tid][lvl] = crd; |
| return {loop, crd}; |
| } |
| |
| Value LoopEmitter::genWhileLoopConditions(OpBuilder &builder, Location loc, |
| ValueRange ivs, TensorLvlCond cond) { |
| auto [tid, lvl] = unpackTensorLevel(cond.first); |
| |
| switch (cond.second) { |
| case LoopCondKind::SparseCond: { |
| assert(ivs.size() == 1); |
| // We use the first level bound as the bound for the collapsed set of levels. |
| return CMPI(ult, ivs.back(), highs[tid][lvl]); |
| } |
| case LoopCondKind::SparseSliceCond: { |
| assert(ivs.size() == 1); |
| return CMPI(ult, ivs.back(), highs[tid][lvl]); |
| } |
| case LoopCondKind::SparseAffineCond: { |
| assert(ivs.size() == 1); |
| |
| Value crdHi; // loop upper bound |
| { |
| OpBuilder::InsertionGuard guard(builder); |
| Operation *loop = builder.getInsertionBlock()->getParentOp(); |
| // crdHi is a loop invariant, hoist the computation outside the loop. |
| if (llvm::isa_and_nonnull<scf::WhileOp>(loop)) |
| builder.setInsertionPoint(loop); |
| auto [remSz, stride] = sliceMeta[tid][lvl].back(); |
| assert(stride == 1 && "Not yet implemented"); |
| crdHi = ADDI(getMostRecentSliceOnLvl(tid, lvl).offset, remSz); |
| } |
| assert(crdHi); |
| return genSparseReducedAffineCond(builder, loc, |
| coordinatesBuffers[tid][lvl], crdHi, |
| ivs[0], highs[tid][lvl]); |
| } |
| case LoopCondKind::SparseAffineUnRedCond: { |
| assert(ivs.size() == 3); |
| return ivs.front(); // isNonEmpty |
| } |
| default: |
| llvm_unreachable("Unhandled LoopCondKind"); |
| } |
| llvm_unreachable("Unhandled LoopCondKind"); |
| } |
| |
| std::optional<Value> LoopEmitter::genWhileLoopBody(OpBuilder &builder, |
| Location loc, ValueRange ivs, |
| TensorLvlCond cond) { |
| auto [tid, lvl] = unpackTensorLevel(cond.first); |
| |
| switch (cond.second) { |
| case LoopCondKind::SparseCond: { |
| // Updates position. For collapsed COO, the position is the same across |
| // consecutive levels. |
| posits[tid][lvl] = ivs.back(); |
| |
| // Update coordinates. |
| coords[tid][lvl] = genSparseCrd(builder, loc, tid, lvl); |
| return std::nullopt; |
| } |
| case LoopCondKind::SparseSliceCond: { |
| assert(ivs.size() == 1); |
| posits[tid][lvl] = ivs.front(); |
| Value sCrd = genSparseCrd(builder, loc, tid, lvl); |
| // Converts the coordinate loaded from the actual sparse tensor to the |
| // coordinates in the sparse slice. |
| auto [dCrd, pred] = genSliceLegitPredicate(builder, loc, sCrd, tid, lvl); |
| coords[tid][lvl] = dCrd; |
| return pred; |
| } |
| case LoopCondKind::SparseAffineCond: { |
| assert(ivs.size() == 1); |
| // Coord is the relative offset with respect to its parent. |
| assert(sliceStack[tid].back().depth == 1 && "TODO: not yet implemented"); |
| sliceTupleFwdCnt[tid][lvl] = SUBI(ivs[0], posits[tid][lvl]); |
| // Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1] |
| Value posit = ivs[0]; |
| Value crdBuf = coordinatesBuffers[tid][lvl]; |
| // We need to subtract the offset to get relative coordinates. |
| // TODO: Maybe assert relC >=0 during runtime in debug build? |
| Value absC = genIndexLoad(builder, loc, crdBuf, posit); |
| auto relC = SUBI(absC, getFinalSliceOnLvl(tid, lvl).offset); |
| posits[tid][lvl] = posit; |
| coords[tid][lvl] = relC; |
| return std::nullopt; |
| } |
| case LoopCondKind::SparseAffineUnRedCond: { |
| unsigned depth = sliceStack[tid].back().depth; |
| unsigned curStride = sliceMeta[tid][lvl][depth - 1].second; |
| assert(ivs.size() == 3); |
| |
| // Updates the current slice info |
| SliceInfo &sliceInfo = sliceStack[tid].back(); |
| sliceInfo.isNonEmpty = ivs[0]; |
| sliceInfo.minCrd = ivs[1]; |
| sliceInfo.offset = ivs[2]; |
| |
| // Crd (the value we use to coiterate) is the relative offset with respect to |
| // its parent; we can use the absolute offset here because when depth == 1, |
| // absOffset[lvl][depth - 1] always equals zero. |
| // TODO: Update crd = absOffset[lvl][depth] - absOffset[lvl][depth - 1] |
| assert(depth == 1 && "TODO: not yet implemented"); |
| Value crd = sliceInfo.offset; |
| |
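| // When the affine expression has a non-unit stride (e.g., 2 * i), only |
| // offsets that are multiples of the stride correspond to valid values of i; |
| // the remainder check below filters out the rest. |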
| Value onStride = constantI1(builder, loc, true); |
| if (curStride != 1) { |
| Value strideVal = C_IDX(curStride); |
| Value rem = REMUI(crd, strideVal); |
| crd = DIVUI(crd, strideVal); |
| onStride = CMPI(eq, rem, C_IDX(0)); |
| } |
| coords[tid][lvl] = crd; |
| // No extra check is needed before accessing the tensor level. |
| return onStride; |
| } |
| default: |
| llvm_unreachable("Unhandled LoopCondKind"); |
| } |
| llvm_unreachable("Unhandled LoopCondKind"); |
| } |
| |
| ValueRange LoopEmitter::genCheckedValue(OpBuilder &builder, Location loc, |
| Value pred, ValueRange curArgs, |
| TensorLvlCond cond) { |
| assert(isSparseCond(cond.second)); |
| auto [tid, lvl] = unpackTensorLevel(cond.first); |
| if (isAffineIdxUnRedCond(cond.second)) { |
| unsigned depth = sliceStack[tid].back().depth; |
| unsigned curStride = sliceMeta[tid][lvl][depth - 1].second; |
| if (curStride == 1) |
| return curArgs; |
| // Build |
| // if (onStride) { |
| // yield curSlice |
| // } else { |
| //   yield nxSlice |
| // } |
| assert(curArgs.size() == 3); |
| auto ifOp = builder.create<scf::IfOp>(loc, curArgs.getTypes(), pred, true); |
| { |
| OpBuilder::InsertionGuard guard(builder); |
| // If the current slice is still legit, yield the current arguments. |
| builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); |
| |
| YIELD(curArgs); |
| // Otherwise, compute and yield the induction values for the next slice. |
| builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); |
| auto [nonEmpty, minCrd, offset] = |
| genSliceNextInduction(builder, loc, tid, lvl); |
| SmallVector<Value> nxSlice{nonEmpty, minCrd, offset}; |
| YIELD(nxSlice); |
| } |
| // Returns the induction values to be yielded when some extra check fails. |
| return ifOp.getResults(); |
| } else { |
| // Currently only the sparse slice condition needs an extra check. |
| assert(isSliceCond(cond.second) && isSparseCond(cond.second)); |
| assert(curArgs.size() == 1); |
| Value nextPos = ADDI(curArgs.front(), C_IDX(1)); |
| return SELECT(pred, curArgs.front(), nextPos)->getResults(); |
| } |
| llvm_unreachable("unhandled case"); |
| } |
| |
| std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls( |
| OpBuilder &builder, Location loc, ArrayRef<TensorLvlCond> spConds, |
| MutableArrayRef<Value> reduc, bool needsUniv) { |
| // NOTE: the slice-driven, tensor-related reduction variables must |
| // appear before those of normal tensors. |
| assert(!spConds.empty()); |
| |
| // The set of induction variables for the while loop. |
| SmallVector<Value> ivs; |
| // Segment sizes for induction variables used for different kinds of loop |
| // conditions. |
| SmallVector<unsigned> opSegSize; |
| |
| // Construct the while-loop with a parameter for each coordinate. |
| for (auto [tl, cKind] : spConds) { |
| auto [tid, lvl] = unpackTensorLevel(tl); |
| const auto lvlTp = lvlTypes[tid][lvl]; |
| // Dense levels are handled by the shared universal index. |
| assert(!isDenseCond(cKind)); |
| // Must be a recognizable sparse level. |
| assert(isCompressedLT(lvlTp) || isLooseCompressedLT(lvlTp) || |
| isSingletonLT(lvlTp)); |
| (void)lvlTp; |
| |
| unsigned prevSz = ivs.size(); |
| if (isAffineIdxCond(cKind)) { |
| // TODO: Support view-based reshape on sparse levels with affine index |
| // expressions. |
| if (isAffineIdxUnRedCond(cKind)) { |
| SliceInfo &sliceInfo = sliceStack[tid].back(); |
| // The order matters! |
| ivs.push_back(sliceInfo.isNonEmpty); |
| ivs.push_back(sliceInfo.minCrd); |
| ivs.push_back(sliceInfo.offset); |
| } else { |
| ivs.push_back(posits[tid][lvl]); // loop lower bound (pos low). |
| } |
| // We reduced one more dependency after entering the loop. |
| levelReducedDep[tid][lvl]++; |
| } else { |
| assert(dependentLvlMap[tid][lvl].empty()); |
| const Value pos = posits[tid][lvl]; |
| ivs.push_back(pos); |
| } |
| opSegSize.push_back(ivs.size() - prevSz); |
| } |
| |
| // The position where user-supplied reduction variable starts. |
| ivs.append(reduc.begin(), reduc.end()); |
| // Update universal index. |
| if (needsUniv) |
| ivs.push_back(loopSeqStack.back().first); |
| |
| // Ensures all operands are valid. |
| assert(llvm::all_of(ivs, [](Value v) { return v != nullptr; })); |
| TypeRange types = ValueRange(ivs).getTypes(); |
| auto whileOp = builder.create<scf::WhileOp>(loc, types, ivs); |
| |
| SmallVector<Location> locs(types.size(), loc); |
| Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs); |
| Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs); |
| |
| // Generates loop conditions. |
| builder.setInsertionPointToStart(before); |
| ValueRange bArgs = before->getArguments(); |
| Value whileCond = nullptr; // bool values for loop condition. |
| for (auto [c, segSz] : llvm::zip_equal(spConds, opSegSize)) { |
| Value cv = genWhileLoopConditions(builder, loc, bArgs.take_front(segSz), c); |
| bArgs = bArgs.drop_front(segSz); |
| whileCond = !whileCond ? cv : ANDI(whileCond, cv); |
| } |
| // The remaining block arguments are user-provided reduction values and an |
| // optional universal index. Make sure their sizes match. |
| assert(bArgs.size() == reduc.size() + (needsUniv ? 1 : 0)); |
| builder.create<scf::ConditionOp>(loc, whileCond, before->getArguments()); |
| |
| // Generates loop body. |
| builder.setInsertionPointToStart(after); |
| ValueRange aArgs = after->getArguments(); |
| // Since some LoopCondKinds might need extra checks to filter out invalid |
| // iterations, we maintain another array to hold the iteration arguments to |
| // yield if the checks fail. |
| SmallVector<Value> nextArgs(aArgs.begin(), aArgs.end()); |
| // A mutable alias for convenient slicing. |
| MutableArrayRef<Value> nextArgsRef = nextArgs; |
| Value extraPred = nullptr; |
| for (auto [c, segSz] : llvm::zip_equal(spConds, opSegSize)) { |
| ValueRange condArgs = aArgs.take_front(segSz); |
| auto pred = genWhileLoopBody(builder, loc, condArgs, c); |
| assert(pred.has_value() == isCondWithExtraCheck(c.second)); |
| if (pred.has_value()) { |
| // We need all extra checks to pass. |
| extraPred = extraPred == nullptr ? *pred : ANDI(*pred, extraPred); |
| ValueRange nxArgs = genCheckedValue(builder, loc, *pred, condArgs, c); |
| assert(nxArgs.size() == segSz); |
| // Update the value for cases when some check fails. |
| for (unsigned i = 0; i < segSz; i++) { |
| nextArgsRef[i] = nxArgs[i]; |
| } |
| } |
| aArgs = aArgs.drop_front(segSz); |
| nextArgsRef = nextArgsRef.drop_front(segSz); |
| } |
| |
| if (extraPred) { |
| auto ifOp = builder.create<scf::IfOp>(loc, types, extraPred, /*else*/ true); |
| // Marks this special IfOp so that Sparsification does not finalize it. |
| ifOp->setAttr(getLoopEmitterLoopAttrName(), |
| StringAttr::get(builder.getContext(), "slice")); |
| // Links the SSA chain outside the if statement. |
| YIELD(ifOp->getResults()); |
| |
| // If not all slices are legit, yield the updated value. |
| builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); |
| YIELD(nextArgs); |
| |
| // If all slices are legit, start the user generated code. |
| builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); |
| } |
| |
| for (auto [tid, lvl] : unpackTensorLevelFromCondRange(spConds)) { |
| // Generates segment high for non-unique level. |
| if (!isUniqueLT(lvlTypes[tid][lvl])) { |
| segHi[tid][lvl] = genSegmentHigh(builder, loc, tid, lvl, posits[tid][lvl], |
| highs[tid][lvl]); |
| } |
| } |
| |
| // In-place update on reduction variable. |
| assert(aArgs.size() == reduc.size() + (needsUniv ? 1 : 0)); |
| for (unsigned i = 0, e = reduc.size(); i < e; i++) |
| reduc[i] = aArgs[i]; |
| |
| Value min; |
| // Finds the minimum coordinate |
| if (!needsUniv) { |
| for (auto [tid, lvl] : unpackTensorLevelFromCondRange(spConds)) { |
| const auto lvlTp = lvlTypes[tid][lvl]; |
| if (isCompressedLT(lvlTp) || isSingletonLT(lvlTp) || |
| isLooseCompressedLT(lvlTp)) { |
| const auto crd = coords[tid][lvl]; |
| if (min) { |
| Value cmp = CMPI(ult, coords[tid][lvl], min); |
| min = SELECT(cmp, coords[tid][lvl], min); |
| } else { |
| min = crd; |
| } |
| } |
| } |
| } else { |
| assert(!min); |
| // Otherwise, the universal index is used as the minimum coordinate. |
| min = whileOp.getAfterArguments().back(); |
| } |
| |
| return {whileOp, min}; |
| } |
| |
| bool LoopEmitter::shouldIteratedByForLoop(ArrayRef<TensorLvlCond> sparseConds, |
| bool genDedup) { |
| assert(llvm::all_of(sparseConds, |
| [](TensorLvlCond c) { return isSparseCond(c.second); })); |
| |
| // If we need to co-iterate over more than one sparse tensor, we need a while loop. |
| if (sparseConds.size() > 1) |
| return false; |
| |
| // We also need a while loop for levels with an affine index expression, and |
| // for non-unique levels when deduplication is required. |
| if (sparseConds.size() == 1) { |
| auto [tid, lvl] = unpackTensorLevel(sparseConds.back().first); |
| return !isAffineIdxCond(sparseConds.back().second) && |
| !(genDedup && !isUniqueLT(lvlTypes[tid][lvl])); |
| } |
| |
| return true; |
| } |
| |
| Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls( |
| OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls, |
| MutableArrayRef<Value> reduc, bool tryParallel, bool genDedup, |
| bool needsUniv) { |
| #ifndef NDEBUG |
| // Sanity checks. |
| assert(!tidLvls.empty()); |
| for (auto [t, l] : unpackTensorLevelRange(tidLvls)) { |
| assert(!coords[t][l] || // We cannot re-enter the same level |
| !dependentLvlMap[t][l].empty()); // unless it is a slice-driver loop |
| } |
| #endif |
| // TODO: support multiple return on parallel for? |
| tryParallel = tryParallel && reduc.size() <= 1; |
| |
| SmallVector<TensorLvlCond> spConds; |
| SmallVector<TensorLvlCond> dnConds; |
| categorizeLoopCondition(tidLvls, dnConds, spConds); |
| |
| // Only when there is at least one sparse condition do we really need the |
| // universal index. |
| // TODO: Maybe we should instead require the merger to pass in a valid value |
| // in the first place instead of adjusting it in LoopEmitter? |
| needsUniv = !spConds.empty() && needsUniv; |
| // The TensorLevel used for loop conditions. |
| // If there is any sparse level, we need to use the sparse condition. |
| // If all levels are dense, we can pick an arbitrary one (a dense slice-driven |
| // loop can be generated using a simple ForOp as well). |
| Operation *l = nullptr; |
| Value iv = nullptr; |
| SmallVector<SliceLoopInfo> sliceDrivenInfo; |
| SmallVector<TensorLevel> trivialLvls; |
| |
| // Generates loops differently depending on whether we need a slice-driven |
| // loop or a simple level traversal loop. |
| if (shouldIteratedByForLoop(spConds, genDedup) && !needsUniv) { |
| assert(spConds.size() <= 1); |
| TensorLvlCond tlCond = spConds.empty() ? dnConds.front() : spConds.front(); |
| auto loopCondKind = tlCond.second; |
| auto [tid, lvl] = unpackTensorLevel(tlCond.first); |
| Value lo = isSparseCond(loopCondKind) |
| ? posits[tid][lvl] // current offset |
| : loopSeqStack.back().first; // universal index |
| Value hi = highs[tid][lvl]; |
| if (isDenseCond(loopCondKind) && isAffineIdxCond(loopCondKind)) { |
| bool unReduc = isAffineIdxUnRedCond(loopCondKind); |
| assert(unReduc == !depFullyReduced(tid, lvl)); |
| unsigned depth = sliceStack[tid].back().depth; |
| assert(depth >= 1); |
| // The *next* slice size after reducing the current index variable. |
| auto [nxSz, nxStride] = sliceMeta[tid][lvl][depth]; |
| // The *current* stride to reduce the current index variable. |
| // E.g., for 2 * i, stride = 2. |
| unsigned stride = sliceMeta[tid][lvl][depth - 1].second; |
| hi = nxSz; |
| if (unReduc) { |
| // Adjust the loop hi for a dense slice-driven loop. |
| hi = SUBI(lvlSizes[tid][lvl], hi); |
| hi = ADDI(hi, C_IDX(1)); |
| hi = DIVUI(hi, C_IDX(stride)); |
| } else { |
| // TODO: dilated convolution. |
| assert(nxStride == 1 && "Not yet implemented."); |
| } |
| } |
| std::tie(l, iv) = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, lo, hi, |
| reduc, tryParallel); |
| // A for-loop condition must be a trivial condition (a level without an affine |
| // index expression). |
| trivialLvls.push_back(tlCond.first); |
| } else { |
| for (auto [tl, cKind] : spConds) { |
| if (isAffineIdxCond(cKind)) { |
| auto [tid, lvl] = unpackTensorLevel(tl); |
| bool unReduc = isAffineIdxUnRedCond(cKind); |
| assert(unReduc == !depFullyReduced(tid, lvl)); |
| sliceDrivenInfo.emplace_back(tid, lvl, /*fullyReduced=*/!unReduc); |
| } else { |
| trivialLvls.push_back(tl); |
| } |
| } |
| |
| std::tie(l, iv) = |
| emitWhileLoopOverTensorsAtLvls(builder, loc, spConds, reduc, needsUniv); |
| } |
| |
| // Enter dense tensor levels. |
| enterTensorsAtDenseLvls(builder, loc, dnConds, iv, sliceDrivenInfo); |
| // NOTE: we can also prepare for next dim here in advance |
| |
| // Pushes the loop into stack. |
| loopStack.emplace_back(trivialLvls, sliceDrivenInfo, l, |
| builder.getInsertionBlock(), iv, loopTag); |
| return l; |
| } |
| |
| Operation *LoopEmitter::enterFilterLoopOverTensorAtLvl( |
| OpBuilder &builder, Location loc, TensorId tid, Level lvl, |
| AffineExpr affine, MutableArrayRef<Value> reduc) { |
| assert(isValidLevel(tid, lvl)); |
| assert(!isa<AffineDimExpr>(affine) && !isDenseLT(lvlTypes[tid][lvl])); |
| // We can not re-enter the same level. |
| assert(!coords[tid][lvl]); |
| |
| // TODO: We should instead use a whileOp for filter loop to allow early |
| // break when exceeding (for ordered levels). |
| // TODO: There are many other potential opportunities that we might apply in |
| // the future. E.g., we could use binary search to locate positions. |
| const Value step = C_IDX(1); |
| const Value pLo = posits[tid][lvl]; |
| const Value pHi = highs[tid][lvl]; |
| scf::ForOp forOp = builder.create<scf::ForOp>(loc, pLo, pHi, step, reduc); |
| |
| // In-place update on the reduction variable vector. |
| assert(forOp.getNumRegionIterArgs() == reduc.size()); |
| for (int i = 0, e = reduc.size(); i < e; i++) |
| reduc[i] = forOp.getRegionIterArg(i); |
| |
| builder.setInsertionPointToStart(forOp.getBody()); |
| // The induction variable gives the position. |
| const Value pos = forOp.getInductionVar(); |
| posits[tid][lvl] = pos; |
| // Generating a load on the coordinates array yields the crd. |
| const Value mem = coordinatesBuffers[tid][lvl]; |
| const Value crd = genIndexLoad(builder, loc, mem, pos); |
| coords[tid][lvl] = crd; |
| |
| // Generate an if-condition to filter out coordinates that are not |
| // equal to the result of the affine expression. |
| Value expected = genAffine(builder, loc, affine); |
| auto pred = CMPI(eq, crd, expected); |
| SmallVector<Type> types; |
| for (Value red : reduc) { |
| types.push_back(red.getType()); |
| } |
| |
| bool hasReduc = !types.empty(); |
| scf::IfOp ifOp = |
| builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc); |
| if (hasReduc) { |
| // scf.for (a) -> v |
| // %s = scf.if (a) -> v |
| // user-generated code. |
| // else |
| // yield a |
| // yield %s |
| YIELD(ifOp.getResults()); |
| builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); |
| // On mismatch. |
| YIELD(reduc); |
| } |
| // Set the insert point to matched branch. |
| builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); |
| |
| // NOTE: we can also prepare for next lvl here in advance |
| // Push the loop into stack |
| loopStack.emplace_back(ArrayRef<TensorLevel>(makeTensorLevel(tid, lvl)), |
| ArrayRef<SliceLoopInfo>(), forOp, |
| builder.getInsertionBlock(), coords[tid][lvl], |
| nullptr); |
| return forOp; |
| } |
| |
| void LoopEmitter::genDenseAffineAddress(OpBuilder &builder, Location loc, |
| TensorLevel tidLvl, |
| AffineExpr lvlExpr) { |
| auto [tid, lvl] = unpackTensorLevel(tidLvl); |
| assert(isDenseLT(lvlTypes[tid][lvl])); |
| // For dense levels, the level-coordinate also serves as the position. |
| Value lvlCrd = genAffine(builder, loc, lvlExpr); |
| posits[tid][lvl] = genAddress(builder, loc, tid, lvl, lvlCrd); |
| } |
| |
| void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc, |
| TensorId tid, Level lvl) { |
| assert(isValidLevel(tid, lvl)); |
| const auto lvlTp = lvlTypes[tid][lvl]; |
| |
| if (isDenseLT(lvlTp)) |
| return; |
| |
| const Value c0 = C_IDX(0); |
| const Value c1 = C_IDX(1); |
| const Value c2 = C_IDX(2); |
| // Either the first level, or the previous level has been set. |
| /// FIXME: See the [CLARIFY_POSITS_LVL] note in the header. |
| assert(lvl == 0 || posits[tid][lvl - 1]); |
| if (isCompressedLT(lvlTp) || isLooseCompressedLT(lvlTp)) { |
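| // For a compressed level, the children of parent position p occupy the |
| // position range [positions[p], positions[p + 1]); for a loose-compressed |
| // level, the positions array stores (lo, hi) pairs, hence the scaling by 2. |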
| const Value mem = positionsBuffers[tid][lvl]; |
| |
| Value pLo = lvl == 0 ? c0 : posits[tid][lvl - 1]; |
| if (isLooseCompressedLT(lvlTp)) |
| pLo = builder.create<arith::MulIOp>(loc, pLo, c2); |
| posits[tid][lvl] = genIndexLoad(builder, loc, mem, pLo); |
| |
| const Value pHi = ADDI(pLo, c1); |
| highs[tid][lvl] = genIndexLoad(builder, loc, mem, pHi); |
| return; |
| } |
| if (isSingletonLT(lvlTp)) { |
| const Value pLo = lvl == 0 ? c0 : posits[tid][lvl - 1]; |
| posits[tid][lvl] = pLo; |
| |
| // If we are coiterating non-unique levels, then use pHi=segHi; |
| // otherwise use pHi=pLo+1. |
| // NOTE: Just because the level is non-unique, that does not |
| // guarantee that segHi is defined: because we only generate segHi |
| // whenever coiterating, in order to improve code quality for the |
| // non-coiterating cases. |
| const auto parentSegHi = segHi[tid][lvl - 1]; |
| highs[tid][lvl] = (!isUniqueLT(lvlTypes[tid][lvl - 1]) && parentSegHi) |
| ? parentSegHi |
| : ADDI(pLo, c1); |
| return; |
| } |
| if (is2OutOf4LT(lvlTp)) { |
| const Value pLo = lvl == 0 ? c0 : posits[tid][lvl - 1]; |
| // Each 2:4 block has exactly two specified elements. |
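| // Hence parent position p covers the stored positions [2 * p, 2 * p + 2). |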
| posits[tid][lvl] = MULI(pLo, c2); |
| highs[tid][lvl] = ADDI(posits[tid][lvl], c2); |
| return; |
| } |
| llvm_unreachable("Unrecognized level-type!"); |
| } |
| |
| void LoopEmitter::enterTensorsAtDenseLvls( |
| OpBuilder &builder, Location loc, ArrayRef<TensorLvlCond> dnConds, Value iv, |
| SmallVectorImpl<SliceLoopInfo> &sliceInfo) { |
| for (auto [dnTidLvl, denseLoopCond] : dnConds) { |
| auto [tid, lvl] = unpackTensorLevel(dnTidLvl); |
| assert(isDenseLT(lvlTypes[tid][lvl])); |
| |
| if (isAffineIdxCond(denseLoopCond)) { |
| // Pushes sliced levels to build correct LoopInfo. |
| bool unReduc = isAffineIdxUnRedCond(denseLoopCond); |
| SliceInfo &info = sliceStack[tid].back(); |
| // Pushes sliced dense loop info to tell LoopEmitter how to exit it. |
| sliceInfo.emplace_back(tid, lvl, /*fullyReduced=*/!unReduc); |
| // FIXME: The offset and position iterator need to be adjusted when the |
| // slice is strided. |
| if (unReduc) { |
| assert(*info.slicedOnLvl == lvl); |
| unsigned depth = sliceStack[tid].back().depth; |
| assert(depth >= 1); |
| unsigned stride = sliceMeta[tid][lvl][depth - 1].second; |
| // Update the slice information as we enter the new loop. |
| info.minCrd = info.offset = MULI(iv, C_IDX(stride)); |
| info.isNonEmpty = constantI1(builder, loc, true); |
| } else { |
| posits[tid][lvl] = |
| genAddress(builder, loc, tid, lvl, ADDI(info.offset, iv)); |
| Value fwdCnt = lvl == 0 || trivialSlice[tid][lvl] |
| ? C_IDX(0) |
| : sliceTupleFwdCnt[tid][lvl - 1]; |
| Value sz = sliceMeta[tid][lvl].back().first; |
| Value mul = MULI(fwdCnt, sz); |
| sliceTupleFwdCnt[tid][lvl] = ADDI(mul, iv); |
| } |
| levelReducedDep[tid][lvl]++; |
| } else { |
| // Skips the synthetic tensor. |
| if (isSynTensor(tid)) |
| continue; |
| // A dense level with trivial index expression. |
| assert(dependentLvlMap[tid][lvl].empty()); |
| auto enc = getSparseTensorEncoding(tensors[tid].getType()); |
| if (enc && !isSparseOutput(tid)) { |
| bool validPos = lvl == 0 || posits[tid][lvl - 1]; |
| if (!validPos) { |
| // We might not find the pos for the sparse output tensor as it is |
| // unconditionally required by the sparsification. |
| assert(isOutputTensor(tid)); |
| continue; |
| } |
| posits[tid][lvl] = genAddress(builder, loc, tid, lvl, iv); |
| // NOTE: we can also prepare for next lvl here in advance |
| } |
| } |
| } |
| } |
| |
| void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc, |
| MutableArrayRef<Value> reduc) { |
| const LoopInfo &loopInfo = loopStack.back(); |
| for (auto [tid, lvl, reduced] : loopInfo.sliceDrivenInfo) { |
| if (!reduced) { |
| SliceInfo &info = sliceStack[tid].back(); |
| assert(isDenseLT(lvlTypes[tid][lvl])); |
| assert(*info.slicedOnLvl == lvl); |
| (void)reduced; |
| info.minCrd = info.offset = info.isNonEmpty = Value(); |
| } |
| levelReducedDep[tid][lvl]--; |
| } |
| if (auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop)) { |
| if (!reduc.empty()) { |
| assert(reduc.size() == forOp.getNumResults()); |
| rewriter.create<scf::YieldOp>(loc, reduc); |
| } |
| // Exit the loop. |
| rewriter.setInsertionPointAfter(forOp); |
| // In-place update reduction variables. |
| for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++) |
| reduc[i] = forOp.getResult(i); |
| } else { |
| auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop); |
| if (!reduc.empty()) { |
| assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1); |
| Operation *redExp = reduc.front().getDefiningOp(); |
| // Reduction expression should have no use. |
| assert(redExp->getUses().empty()); |
| // This must be a binary operation. |
| // NOTE: It is the user's responsibility to ensure the operation is |
| // commutative. |
| assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1); |
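| // E.g., a body reduction such as %r = arith.addf %init, %x is rewritten |
| // below into scf.reduce(%x), whose region applies the cloned arith.addf to |
| // the two block arguments and yields it via scf.reduce.return. |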
| |
| Value redVal = parOp.getInitVals().front(); |
| Value curVal; |
| if (redExp->getOperand(0) == redVal) |
| curVal = redExp->getOperand(1); |
| else if (redExp->getOperand(1) == redVal) |
| curVal = redExp->getOperand(0); |
| // One of the operands must be the init value (which is also the |
| // previous reduction value). |
| assert(curVal); |
| #ifndef NDEBUG |
| // The reduction expression should be the only user of the reduction val |
| // inside the parallel for. |
| unsigned numUsers = 0; |
| for (Operation *op : redVal.getUsers()) { |
| if (op->getParentOp() == parOp) |
| numUsers++; |
| } |
| assert(numUsers == 1); |
| #endif // NDEBUG |
| |
| rewriter.setInsertionPointAfter(redExp); |
| auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal); |
| // Attach to the reduction op. |
| Block *redBlock = &redOp.getRegion().getBlocks().front(); |
| rewriter.setInsertionPointToEnd(redBlock); |
| Operation *newRed = rewriter.clone(*redExp); |
| // Replaces arguments of the reduction expression by using the block |
| // arguments from scf.reduce. |
| rewriter.updateRootInPlace( |
| newRed, [&]() { newRed->setOperands(redBlock->getArguments()); }); |
| // Erases the outdated reduction expression. |
| rewriter.eraseOp(redExp); |
| rewriter.setInsertionPointToEnd(redBlock); |
| rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0)); |
| } |
| rewriter.setInsertionPointAfter(parOp); |
| // In-place update reduction variables. |
| for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++) |
| reduc[i] = parOp.getResult(i); |
| } |
| |
| // Finished iterating a tensor; clean up. |
| // We only do the clean-up for for-loops, since while-loops do not |
| // necessarily finish iterating over a sparse tensor. |
| for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.trivialTidLvls)) { |
| // Reset to null. |
| coords[tid][lvl] = Value(); |
| posits[tid][lvl] = Value(); |
| // The high of a dense level is fixed, so only reset it for sparse levels. |
| if (!isDenseLT(lvlTypes[tid][lvl])) |
| highs[tid][lvl] = Value(); |
| } |
| } |
| |
| void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc, |
| MutableArrayRef<Value> reduc) { |
| const LoopInfo &loopInfo = loopStack.back(); |
| auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop); |
| Value iv = loopInfo.iv; |
| Value one = C_IDX(1); |
| |
| // Finalize the induction. Note that the induction could be performed |
| // in the individual if-branches to avoid re-evaluating the conditions. |
| // However, that would result in a rather elaborate forest of yield |
| // instructions during code generation. Moreover, performing the induction |
| // after the if-statements more closely resembles code generated by TACO. |
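| // A rough sketch (per co-iterated tensor level) of the induction emitted |
| // below, where `pos`/`crd` denote that level's position and coordinate: |
| //   pos = (crd == iv) ? (unique ? pos + 1 : segHi) : pos; |
| //   iv  = iv + 1;  // universal index, if present. |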
| unsigned o = 0; |
| SmallVector<Value> operands; |
| unsigned delta = 0; |
| for (auto [tid, lvl, resolved] : loopInfo.sliceDrivenInfo) { |
| // TODO: handle dense. |
| assert(isCompressedLT(lvlTypes[tid][lvl])); |
| levelReducedDep[tid][lvl]--; |
| if (!resolved) { |
| // TODO: support coiterating multiple slices |
| assert(loopInfo.sliceDrivenInfo.size() == 1); |
| auto [nxNonEmpty, nxMinCrd, nxAbsOffset] = |
| genSliceNextInduction(builder, loc, tid, lvl); |
| // Update while loop induction operands. |
| operands.push_back(nxNonEmpty); |
| operands.push_back(nxMinCrd); |
| operands.push_back(nxAbsOffset); |
| |
| // Update the slice stack. |
| SliceInfo &info = sliceStack[tid].back(); |
| info.isNonEmpty = whileOp.getResult(o++); |
| info.minCrd = whileOp.getResult(o++); |
| info.offset = whileOp.getResult(o++); |
| continue; |
| } |
| |
| Value forwarded = nullptr; |
| if (loopInfo.trivialTidLvls.empty() && |
| loopInfo.sliceDrivenInfo.size() == 1) { |
| // Forwards the position iterator. |
| operands.push_back(ADDI(posits[tid][lvl], one)); |
| forwarded = constantI1(builder, loc, true); |
| } else { |
| const Value pos = posits[tid][lvl]; |
| const Value nxPos = ADDI(posits[tid][lvl], one); |
| forwarded = CMPI(eq, coords[tid][lvl], iv); |
| operands.push_back(SELECT(forwarded, nxPos, pos)); |
| } |
| // The coordinate is invalid now. |
| coords[tid][lvl] = nullptr; |
| |
| // Update the position iterator as we exit the while loop. |
| posits[tid][lvl] = whileOp->getResult(o++); |
| } |
| |
| for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.trivialTidLvls)) { |
| const auto lvlTp = lvlTypes[tid][lvl]; |
| if (isCompressedLT(lvlTp) || isSingletonLT(lvlTp) || |
| isLooseCompressedLT(lvlTp)) { |
| const Value crd = coords[tid][lvl]; |
| const Value pos = posits[tid][lvl]; |
| Value cmp = CMPI(eq, crd, iv); |
| // If the loop co-iterates a non-unique level, we fast-forward over all |
| // the duplicated coordinates by setting the position to the segment |
| // high. |
| Value add = |
| !isUniqueLT(lvlTypes[tid][lvl]) ? segHi[tid][lvl] : ADDI(pos, one); |
| |
| operands.push_back(SELECT(cmp, add, pos)); |
| // Subsequent loops continue iterating from the break point of the |
| // current while loop. |
| const Value newPos = whileOp->getResult(o++); |
| // We need to define a new local variable for `tid` to avoid |
| // warnings about "captured structured bindings are a C++20 extension". |
| // FIXME(wrengr): define a helper function to capture this idiom! |
| const TensorId newTid = tid; |
| posits[newTid][lvl] = newPos; |
| |
| // The coordinate is invalid now. |
| coords[tid][lvl] = nullptr; |
| // The segment high is invalid now. |
| segHi[tid][lvl] = nullptr; |
| // highs remains unchanged. |
| } |
| } |
| |
| // Reduction values from the user. |
| for (auto &i : reduc) { |
| operands.push_back(i); |
| // In-place update of the reduction variable. |
| i = whileOp->getResult(o++); |
| } |
| |
| // An (optional) universal index. |
| if (operands.size() + delta < whileOp.getNumResults()) { |
| assert(operands.size() + delta + 1 == whileOp.getNumResults()); |
| // The last one is the universal index. |
| operands.push_back(ADDI(iv, one)); |
| // Update the loop starting point of the current loop sequence. |
| loopSeqStack.back().first = whileOp->getResult(o++); |
| } |
| |
| assert(o == operands.size() + delta); |
| if (!operands.empty()) |
| YIELD(operands); |
| |
| builder.setInsertionPointAfter(whileOp); |
| } |
| |
| void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc, |
| MutableArrayRef<Value> reduc) { |
| // Clean up the values; this helps us discover potential bugs at an |
| // earlier stage (instead of silently using a wrong value). |
| const LoopInfo &loopInfo = loopStack.back(); |
| |
| // Sets the insertion point to the right position. |
| rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock); |
| if (!loopInfo.userCodeBlock->empty() && |
| llvm::isa<scf::YieldOp>(&loopInfo.userCodeBlock->back())) { |
| // scf::While/For inserts an implicit yield op when there are no loop |
| // iter args. In this case, we need to insert the code before the yield. |
| assert(loopInfo.userCodeBlock->back().getNumResults() == 0); |
| rewriter.setInsertionPoint(&loopInfo.userCodeBlock->back()); |
| } |
| |
| if (llvm::isa<scf::WhileOp>(loopInfo.loop)) { |
| exitWhileLoop(rewriter, loc, reduc); |
| } else { |
| exitForLoop(rewriter, loc, reduc); |
| } |
| |
| assert(loopStack.size() == loopSeqStack.size()); |
| loopStack.pop_back(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Slice-driven loop related methods. |
| //===----------------------------------------------------------------------===// |
| |
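| // Returns the number of index-expression dependencies on `lvl` that have |
| // not yet been reduced. E.g. (illustrative), for an access t[d0 + d1] the |
| // level carries two dependencies; after d0 has been reduced, one remains. |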
| unsigned LoopEmitter::remDepOnLevel(TensorId tid, Level lvl) const { |
| unsigned totalDependencies = dependentLvlMap[tid][lvl].size(); |
| if (totalDependencies != 0) { |
| assert(totalDependencies >= 2); |
| return totalDependencies - levelReducedDep[tid][lvl]; |
| } |
| return totalDependencies; |
| } |
| |
| const LoopEmitter::SliceInfo &LoopEmitter::getMostRecentSliceOnLvl(TensorId tid, |
| Level lvl) { |
| // Finds the most-recent slice using a reverse iteration. |
| for (auto it = sliceStack[tid].rbegin(), ie = sliceStack[tid].rend(); it < ie; |
| it++) { |
| if (it->slicedOnLvl == lvl) { // the level matched |
| return *it; |
| } |
| } |
| llvm_unreachable("Failed to find sliceInfo"); |
| } |
| |
| // Generates a while loop to iterate over a sliced sparse level as follows. |
| // |
| // while(coords[loopLo] < offset + size) { |
| // body_builder |
| // loopLo ++; |
| // } |
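| // A slightly more detailed sketch of the scf.while emitted below (the |
| // combined condition comes from genSparseReducedAffineCond): |
| // |
| //   while (pos < posHi && coords[pos] < offset + sliceSz) { |
| //     bodyBuilder(pos, userReduc); |
| //     pos++; |
| //   } |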
| std::pair<Operation *, ValueRange> LoopEmitter::genSliceLvlTraverseLoop( |
| OpBuilder &builder, Location loc, Value posLo, Value posHi, Value offset, |
| Value size, TensorId tid, Level lvl, ValueRange userReduc, |
| LoopBodyBuilder bodyBuilder) { |
| Value c1 = C_IDX(1); |
| auto [sliceSz, stride] = sliceMeta[tid][lvl].back(); |
| assert(stride == 1 && "Not yet implemented"); |
| Value sliceHi = ADDI(offset, sliceSz); |
| |
| SmallVector<Value> reduc{posLo}; // loop lower bounds |
| const unsigned numMetaReduc = reduc.size(); |
| |
| // Append the user-required reduction values. |
| reduc.append(userReduc.begin(), userReduc.end()); |
| scf::WhileOp whileOp = builder.create<scf::WhileOp>( |
| loc, ValueRange(reduc).getTypes(), reduc, |
| /*beforeBuilder=*/ |
| [this, posHi, sliceHi, tid, lvl](OpBuilder &builder, Location loc, |
| ValueRange args) { |
| Value cond = genSparseReducedAffineCond(builder, loc, |
| coordinatesBuffers[tid][lvl], |
| sliceHi, args[0], posHi); |
| // Continue if we are neither at a break point nor out of bounds. |
| builder.create<scf::ConditionOp>(loc, cond, args); |
| }, |
| /*afterBuilder=*/ |
| [c1, numMetaReduc, bodyBuilder](OpBuilder &builder, Location loc, |
| ValueRange args) { |
| Value iv = args[0]; |
| TypeRange types = args.drop_front(numMetaReduc).getTypes(); |
| // The coordinate must be in bounds, as guaranteed by the loop |
| // condition. We generate a trivially true if operation here only to |
| // hide the extra loop induction variables that we maintain from the |
| // user; it will be removed by a later optimization pass. |
| auto ifOp = builder.create<scf::IfOp>(loc, types, |
| constantI1(builder, loc, true), |
| /*withElseBlock=*/!types.empty()); |
| { |
| // Keep only the user's reduction values (drop the internal ones). |
| SmallVector<Value> ifRet = args.drop_front(numMetaReduc); |
| assert(ifRet.size() == args.size() - 1); |
| |
| OpBuilder::InsertionGuard guard(builder); |
| // If coord >= sliceHi. |
| if (!ifRet.empty()) { |
| builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); |
| YIELD(ifRet); |
| } |
| |
| // If coord < sliceHi. |
| builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); |
| // Delegate to the user's callback. |
| bodyBuilder(builder, loc, iv, ifRet); |
| } |
| // Mark this special ifOp so that sparsification does not finalize it. |
| ifOp->setAttr(getLoopEmitterLoopAttrName(), |
| StringAttr::get(builder.getContext(), "slice")); |
| // Insertion point restored to after ifOp. |
| SmallVector<Value> yields; |
| // Increment the induction variable. |
| yields.push_back(ADDI(iv, c1)); |
| yields.append(ifOp.getResults().begin(), ifOp.getResults().end()); |
| YIELD(yields); |
| }); |
| |
| builder.setInsertionPointAfter(whileOp); |
| return std::make_pair(whileOp, whileOp.getResults().drop_front(numMetaReduc)); |
| } |
| |
| // Generates a loop nest that traverses all the unresolved levels in between. |
| // |
| // for(int i = 0; i < slicePos.size(); i+=2) { |
| // loopLo = slicePos[i]; |
| // loopHi = slicePos[i + 1]; |
| // |
| // // Then the same loop generated by genSliceLvlTraverse above. |
| // while (loopLo < loopHi) { |
| // if (pos[loopLo] < sliceHi) { |
| // bodyBuilder(); |
| // } else { |
| // break; |
| // } |
| // loopLo ++; |
| // } |
| // } |
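| // |
| // When the front slice is an unresolved compressed level, the user body is |
| // wrapped (a sketch; see `wrapped` below): |
| //   wrapped(iv, reduc) { |
| //     bodyBuilder(iv, reduc.drop_back()); |
| //     reduc.back()++;  // counts the coordinates visited so far |
| //   } |
| // The count is stored as the starting position-tuple index for the next |
| // level (the SlicePosKind::kNext update below). |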
| ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse( |
| OpBuilder &builder, Location loc, TensorId tid, |
| ArrayRef<const SliceInfo *> unResLvls, |
| std::optional<std::pair<TensorId, Level>> firstResLvl, ValueRange userReduc, |
| LoopBodyBuilder bodyBuilder) { |
| |
| Value c0 = C_IDX(0), c1 = C_IDX(1); |
| Value pos = c0; |
| OpBuilder::InsertPoint ip; |
| SmallVector<Value> innerArgs(userReduc.begin(), userReduc.end()); |
| scf::ForOp outerMost = nullptr; // the outermost loop. |
| |
| // Wraps the body builder and appends an extra counting instruction. |
| auto wrapped = [bodyBuilder](OpBuilder &builder, Location loc, Value iv, |
| MutableArrayRef<Value> reduc) { |
| bodyBuilder(builder, loc, iv, reduc.drop_back()); |
| // Increments the counter. |
| reduc.back() = ADDI(reduc.back(), C_IDX(1)); |
| }; |
| |
| // FIXME: Need special handling when the previous unresolved slice is strided: |
| // We probably need to filter out coordinates that are not on the stride. |
| if (firstResLvl.has_value()) { |
| // Overwrite position when the first level is fully resolved. |
| pos = posits[firstResLvl->first][firstResLvl->second]; |
| ip = builder.saveInsertionPoint(); |
| } else { |
| const SliceInfo &frontSlice = *unResLvls.back(); |
| Level firstLvl = *frontSlice.slicedOnLvl; |
| if (!lvlFullyResolved(tid, firstLvl)) { |
| if (isCompressedLT(lvlTypes[tid][firstLvl])) { |
| // An extra counter that tracks how many segments there are in the child |
| // compressed level. |
| innerArgs.push_back(c0); |
| // Overrides the user-provided builder. |
| bodyBuilder = wrapped; |
| unsigned depth = frontSlice.depth - 1; |
| Value offset = frontSlice.offset; |
| Value sPtrBuf = slicePosBuffer[tid][firstLvl][depth]; |
| Value mSz = frontSlice.posTupleNum; |
| outerMost = builder.create<scf::ForOp>( |
| loc, c0, mSz, c1, innerArgs, |
| [this, tid, firstLvl, offset, sPtrBuf, &ip, &pos, |
| &innerArgs](OpBuilder &builder, Location loc, Value iv, |
| ValueRange iterArgs) { |
| // Generate the traversal for each level. |
| Value loopLo = |
| loadSlicePos(builder, loc, sPtrBuf, iv, SlicePosKind::kLo); |
| Value loopHi = |
| loadSlicePos(builder, loc, sPtrBuf, iv, SlicePosKind::kHi); |
| // We need to remember the starting index for the next level's |
| // position, because the slice-driven loop breaks the level into |
| // non-consecutive segments. |
| updateSlicePos(builder, loc, sPtrBuf, iterArgs.back(), iv, |
| SlicePosKind::kNext); |
| |
| auto [size, stride] = sliceMeta[tid][firstLvl].back(); |
| assert(stride == 1 && "Not yet implemented"); |
| ValueRange itArgs = |
| genSliceLvlTraverseLoop( |
| builder, loc, loopLo, loopHi, offset, size, tid, firstLvl, |
| iterArgs, |
| [&](OpBuilder &builder, Location, Value iv, |
| MutableArrayRef<Value> reduc) { |
| ip = builder.saveInsertionPoint(); |
| pos = iv; |
| innerArgs.assign(reduc.begin(), reduc.end()); |
| }) |
| .second; |
| YIELD(itArgs); |
| }); |
| } else if (isDenseLT(lvlTypes[tid][firstLvl])) { |
| assert(firstLvl == 0); // This must be the first level. |
| Value lb = frontSlice.offset; |
| auto [sliceSz, stride] = |
| sliceMeta[tid][*frontSlice.slicedOnLvl][frontSlice.depth]; |
| assert(stride == 1 && "Not yet implemented"); |
| Value ub = ADDI(lb, sliceSz); |
| outerMost = builder.create<scf::ForOp>( |
| loc, lb, ub, c1, innerArgs, |
| [&](OpBuilder &builder, Location loc, Value iv, |
| ValueRange iterArgs) { |
| ip = builder.saveInsertionPoint(); |
| pos = iv; |
| innerArgs.assign(iterArgs.begin(), iterArgs.end()); |
| }); |
| } |
| // We generated the loop for the first slice above, now remove it. |
| unResLvls = unResLvls.drop_back(); |
| } |
| } |
| // Reset the insertion point into the loop body. |
| builder.restoreInsertionPoint(ip); |
| if (!unResLvls.empty()) { |
| // Fill in the dense slice levels in between. |
| SmallVector<Value> lbs, ubs, steps, lvlSzs; |
| for (const SliceInfo *slice : llvm::reverse(unResLvls)) { |
| Level sliceLvl = *slice->slicedOnLvl; |
| assert(isDenseLT(lvlTypes[tid][sliceLvl])); |
| Value offset = slice->offset; |
| auto [sliceSz, stride] = sliceMeta[tid][sliceLvl][slice->depth]; |
| assert(stride == 1 && "Not yet implemented"); |
| lbs.push_back(offset); |
| ubs.push_back(ADDI(offset, sliceSz)); |
| steps.push_back(c1); |
| lvlSzs.push_back(lvlSizes[tid][sliceLvl]); |
| } |
| auto denseNest = |
| scf::buildLoopNest(builder, loc, lbs, ubs, steps, innerArgs, |
| [&innerArgs, &lvlSzs, &pos, bodyBuilder]( |
| OpBuilder &builder, Location loc, ValueRange ivs, |
| ValueRange iterArgs) -> scf::ValueVector { |
| for (auto em : llvm::enumerate(ivs)) { |
| // Linearize the position: pos = (pos * lvlSize) + iv. |
| pos = MULI(pos, lvlSzs[em.index()]); |
| pos = ADDI(pos, em.value()); |
| } |
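| // E.g. (a sketch), with two trailing dense levels whose level sizes |
| // are s1 and s2, the loop above computes |
| //   pos = ((pos0 * s1) + iv1) * s2 + iv2, |
| // where pos0 is the position inherited from the resolved or compressed |
| // parent level. |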
| innerArgs.assign(iterArgs.begin(), iterArgs.end()); |
| // Generate the user-requested loop body. |
| bodyBuilder(builder, loc, pos, innerArgs); |
| return innerArgs; |
| }); |
| |
| if (!outerMost) { |
| // If the outermost loop has not been set, this is the outermost loop. |
| outerMost = denseNest.loops.front(); |
| } else { |
| // Otherwise we need to generate yield operations to link the SSA chain. |
| YIELD(denseNest.results); |
| } |
| } else { |
| assert(outerMost); |
| // Generate the user-requested loop body. |
| bodyBuilder(builder, loc, pos, innerArgs); |
| YIELD(innerArgs); |
| } |
| assert(outerMost); |
| // Set the insertion point after the outermost loop. |
| builder.setInsertionPointAfter(outerMost); |
| return outerMost.getResults(); |
| } |
| |
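| // Generates the beginning of a slice on `lvl` when the parent level is |
| // already fully resolved (dense levels are trivial and handled first |
| // below). A rough sketch of the emitted code for the compressed case, |
| // assuming stride == 1: |
| // |
| //   pLo = (lvl == 0) ? 0 : positions[lvl][parentPos]; |
| //   pHi = (lvl == 0) ? positions[0][1] : positions[lvl][parentPos + 1]; |
| //   slicePosBuf[0] = (pLo, pHi);  // a single position tuple |
| //   isNonEmpty = pLo < pHi; |
| //   minCrd = coordinates[lvl][pLo]; |
| //   offset = offsetFromMinCoord(minCrd, size, isNonEmpty); |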
| void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc, |
| TensorId tid, Level lvl) { |
| Value c0 = C_IDX(0), c1 = C_IDX(1); |
| if (isDenseLT(lvlTypes[tid][lvl])) { |
| // Dense slice begin is trivial. |
| sliceStack[tid].emplace_back(/*minCoord=*/c0, /*offset=*/c0, |
| /*nonEmpty=*/constantI1(builder, loc, true), |
| c0, lvl, /*depth=*/1); |
| return; |
| } |
| auto [nxSz, stride] = sliceMeta[tid][lvl][1]; |
| assert(stride == 1 && "Not yet implemented"); |
| Value sPtrBuf = slicePosBuffer[tid][lvl][0]; |
| Value pHi, pLo; |
| if (lvl == 0) { |
| pLo = c0; |
| pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1); |
| } else { |
| pLo = genIndexLoad(builder, loc, positionsBuffers[tid][lvl], |
| posits[tid][lvl - 1]); |
| pHi = genIndexLoad(builder, loc, positionsBuffers[tid][lvl], |
| ADDI(posits[tid][lvl - 1], c1)); |
| } |
| // Fills out pIdxBuffer[tid][lvl][0] with [pLo, pHi] |
| updateSlicePos(builder, loc, sPtrBuf, pLo, c0, SlicePosKind::kLo); |
| updateSlicePos(builder, loc, sPtrBuf, pHi, c0, SlicePosKind::kHi); |
| // Since we slice over a resolved parent, a single pair of (pLo, pHi) |
| // suffices to specify the current slice. |
| Value tupleNum = c1; |
| // The slice is non-empty if pLo < pHi. |
| Value isNonEmpty = CMPI(ult, pLo, pHi); |
| // On an ordered level, the minimum coordinate is always the first one. |
| // FIXME: Technically we should only load the coordinate when the slice is |
| // non-empty. However, we assume that even for an empty sparse tensor a |
| // non-empty ptr/idx buffer is allocated for each level, so the load cannot |
| // go out of bounds and we avoid generating an ifOp here. |
| Value minCrd = genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo); |
| |
| // FIXME: We need the offset relative to the base slice. |
| Value absOffset = offsetFromMinCoord(builder, loc, minCrd, nxSz, isNonEmpty); |
| sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, tupleNum, lvl, |
| /*depth=*/1); |
| } |
| |
| // Fills in the slicePosBuffer before the slice-driven loop begins. |
| // TODO: it can only handle tensors whose levels are all compressed. |
| // |
| // // Loop generated by `genUnResolvedSliceTreeTraverse` |
| // for(int i = 0; i < slicePos.size(); i+=2) { |
| // loopLo = slicePos[i]; |
| // loopHi = slicePos[i + 1]; |
| // minCrd = max; |
| // while (loopLo < loopHi) { |
| // if (pos[loopLo] < sliceHi) { |
| // // bodyBuilder |
| // slicePos[tid].push_back(pos[loopLo]); |
| // slicePos[tid].push_back(pos[loopLo + 1]); |
| // minCrd = min(minCrd, crd[pos[loopLo]]); |
| // } else { |
| // break; |
| // } |
| // loopLo ++; |
| // } |
| // } |
| void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc, |
| TensorId tid, Level lvl) { |
| Value c0 = C_IDX(0), c1 = C_IDX(1); |
| unsigned depth = levelReducedDep[tid][lvl]; |
| // The remaining slice size after reduction. |
| Value remSz = sliceMeta[tid][lvl][depth + 1].first; |
| // Dense slice begin is trivial. |
| if (isDenseLT(lvlTypes[tid][lvl])) { |
| sliceStack[tid].emplace_back(c0, c0, constantI1(builder, loc, false), c0, |
| lvl, depth + 1); |
| return; |
| } |
| |
| assert(isCompressedLT(lvlTypes[tid][lvl])); |
| // Unhandled Cases: |
| // |
| // 1st, lvl = prevSlicedLvl, i.e., t[d0 + d1 + d2,...] (more than one |
| // variable needs to be reduced on the same level). |
| // |
| // 2nd, lvl > prevSliceLvl + 1, i.e., t[..., d2, d3 + d4] (having a |
| // simple dim expression in between). |
| assert(lvl == *sliceStack[tid].back().slicedOnLvl + 1); |
| |
| // Check slice stack integrity. |
| assert(slicePosBuffer[tid][lvl - 1].size() == sliceStack[tid].back().depth); |
| |
| SmallVector<const SliceInfo *> unResSlices; |
| std::optional<std::pair<TensorId, Level>> firstResLvl; |
| for (Level curLvl = lvl; curLvl >= 1; curLvl--) { |
| Level prevLvl = curLvl - 1; |
| if (lvlFullyResolved(tid, prevLvl)) { |
| firstResLvl = std::make_pair(tid, prevLvl); |
| break; |
| } |
| unResSlices.push_back(&getMostRecentSliceOnLvl(tid, prevLvl)); |
| if (!isDenseLT(lvlTypes[tid][prevLvl])) { |
| break; |
| } |
| } |
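| // E.g. (illustrative), with level types (compressed, dense, compressed) |
| // and lvl == 2: the walk above collects the unresolved dense slice on |
| // level 1, then the unresolved compressed slice on level 0, and stops; |
| // it stops earlier (recording firstResLvl) as soon as it reaches a fully |
| // resolved level. |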
| |
| assert(!unResSlices.empty() && |
| !lvlFullyResolved(tid, *unResSlices.front()->slicedOnLvl)); |
| |
| Value sPtrBuf = slicePosBuffer[tid][lvl].back(); |
| SmallVector<Value, 3> reduc = { |
| constantI1(builder, loc, false), // isNonEmpty |
| lvlSizes[tid][lvl], // minCoord |
| c0, // memSize |
| }; |
| |
| ValueRange result = genUnResolvedSliceTreeTraverse( |
| builder, loc, tid, unResSlices, firstResLvl, reduc, |
| [this, c1, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc, Value iv, |
| MutableArrayRef<Value> reduc) { |
| Value &nonEmpty = reduc[0]; |
| Value &minCrd = reduc[1]; |
| Value &curTupleCnt = reduc[2]; |
| |
| Value pHi = ADDI(iv, c1); |
| Value sPLo = genIndexLoad(builder, loc, positionsBuffers[tid][lvl], iv); |
| Value sPHi = |
| genIndexLoad(builder, loc, positionsBuffers[tid][lvl], pHi); |
| |
| // isNonEmpty = isNonEmpty || lvlNonEmpty, i.e., as long as there is |
| // one non-empty lvl, the slice is non-empty. |
| Value lvlNonEmpty = CMPI(ult, sPLo, sPHi); |
| nonEmpty = builder.create<arith::OrIOp>(loc, lvlNonEmpty, nonEmpty); |
| |
| // Update the minimum coordinate. |
| auto ifNonEmpty = builder.create<scf::IfOp>(loc, builder.getIndexType(), |
| lvlNonEmpty, true); |
| { |
| // Generate code as follows. |
| // |
| // if (nonEmpty) { |
| // minCrd = min(minCrd, crd[pos[pLo]]); |
| // } |
| OpBuilder::InsertionGuard guard(builder); |
| builder.setInsertionPointToStart(ifNonEmpty.thenBlock()); |
| Value curC = |
| genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], sPLo); |
| Value isSmaller = CMPI(ult, curC, minCrd); |
| Value newMin = SELECT(isSmaller, curC, minCrd); |
| YIELD(newMin); |
| builder.setInsertionPointToStart(ifNonEmpty.elseBlock()); |
| YIELD(minCrd); |
| } |
| minCrd = ifNonEmpty.getResult(0); |
| updateSlicePos(builder, loc, sPtrBuf, sPLo, curTupleCnt, |
| SlicePosKind::kLo); |
| updateSlicePos(builder, loc, sPtrBuf, sPHi, curTupleCnt, |
| SlicePosKind::kHi); |
| curTupleCnt = ADDI(curTupleCnt, C_IDX(1)); |
| }); |
| |
| Value isNonEmpty = result[0]; |
| Value minCrd = result[1]; |
| // result[2] holds the number of position tuples collected above. |
| // FIXME: We need the offset relative to the base slice. |
| Value absOffset = offsetFromMinCoord(builder, loc, minCrd, remSz, isNonEmpty); |
| sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, result[2], lvl, |
| depth + 1); |
| } |
| |
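| // A rough sketch of the fully-reduced compressed case handled first below, |
| // which simply rewires the level to its cached position tuple and then |
| // falls back to ordinary (co)iteration over the slice: |
| // |
| //   tupleIdx  = trivial ? 0 : nxStartIdx[lvl - 1] + fwdCnt[lvl - 1]; |
| //   pos[lvl]  = slicePosBuf[lvl][tupleIdx].lo; |
| //   high[lvl] = slicePosBuf[lvl][tupleIdx].hi; |
| // |
| // (nxStartIdx/fwdCnt abbreviate sliceTupleNxStartIdx/sliceTupleFwdCnt.) |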
| bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid, |
| Level lvl) { |
| Value curLvlIdx = C_IDX(0); |
| if (depFullyReduced(tid, lvl)) { |
| if (lvl == 0 || trivialSlice[tid][lvl]) { |
| sliceTupleNxStartIdx[tid][lvl] = C_IDX(0); |
| } else { |
| if (isDenseLT(lvlTypes[tid][lvl])) { |
| sliceTupleNxStartIdx[tid][lvl] = sliceTupleNxStartIdx[tid][lvl - 1]; |
| } else { |
| assert(isCompressedLT(lvlTypes[tid][lvl])); |
| curLvlIdx = ADDI(sliceTupleNxStartIdx[tid][lvl - 1], |
| sliceTupleFwdCnt[0][lvl - 1]); |
| sliceTupleNxStartIdx[tid][lvl] = |
| loadSlicePos(builder, loc, slicePosBuffer[tid][lvl].back(), |
| curLvlIdx, SlicePosKind::kNext); |
| } |
| } |
| if (isDenseLT(lvlTypes[tid][lvl])) |
| return true; |
| |
| Value sPosBuf = slicePosBuffer[tid][lvl].back(); |
| // If the constraints on the tensor are fully resolved, we no longer need |
| // to generate a slice begin; instead, we fall back to the TACO-based |
| // algorithm to (co)iterate over the slice. |
| Value tupleIdx = curLvlIdx; |
| posits[tid][lvl] = |
| loadSlicePos(builder, loc, sPosBuf, tupleIdx, SlicePosKind::kLo); |
| highs[tid][lvl] = |
| loadSlicePos(builder, loc, sPosBuf, tupleIdx, SlicePosKind::kHi); |
| return true; |
| } |
| |
| // The next non-empty slice can be computed efficiently only when the |
| // level is sorted. |
| const LevelType lvlType = lvlTypes[tid][lvl]; |
| assert(isOrderedLT(lvlType)); |
| if (isSingletonLT(lvlType)) { |
| llvm_unreachable("TODO: dense level should be easy to support, while " |
| "singleton level requires more efforts"); |
| } |
| |
| assert(!dependentLvlMap[tid][lvl].empty()); |
| assert(!sliceStack[tid].empty()); |
| |
| const SliceInfo &sliceInfo = sliceStack[tid].back(); |
| auto baseEnc = getSparseTensorEncoding(tensors[tid].getType()); |
| if (baseEnc.isSlice()) |
| llvm_unreachable("TODO: not yet implemented"); |
| |
| // Generate the caches required to quickly compute the next non-empty |
| // slice with increasing offset for the slice-based loop. |
| // We do not need a cache for dense levels. |
| if (slicePosBuffer[tid][lvl][0] == nullptr && !isDenseLT(lvlType)) { |
| OpBuilder::InsertionGuard guard(builder); |
| // The buffer can be reused, and the size is loop invariant: it only |
| // depends on the iteration graph's toposort. |
| builder.setInsertionPointAfter(localInsertPos); |
| Value tupleCnt = C_IDX(1); |
| // Accumulates the size required to cache the pLo for the slice. |
| // E.g., to cache the pIdx for slice<d0xd1xf64> on the second level, we |
| // need at most a memref<d0xindex>. |
| // NOTE: this is apparently an over-approximation when the previous |
| // level is compressed, and we could compute a precise memory size |
| // inside the loops. But that would also require us to allocate/free |
| // memory inside the loops. |
| // TODO: Maybe use allocaScopeOp inside the loop to resolve the issue? |
| for (Level curLevel = lvl; |
| curLevel >= 1 && !lvlFullyResolved(tid, curLevel - 1); curLevel--) { |
| // We only handle cases when all the previously unresolved levels are |
| // fully reduced. |
| assert(depFullyReduced(tid, curLevel - 1)); |
| assert(!sliceMeta[tid][curLevel - 1].empty()); |
| auto [sz, stride] = sliceMeta[tid][curLevel - 1].back(); |
| assert(stride == 1 && "Not yet implemented"); |
| tupleCnt = MULI(tupleCnt, sz); |
| } |
| for (Value &cache : slicePosBuffer[tid][lvl]) |
| cache = allocSlicePosBuf(builder, loc, tupleCnt); |
| } |
| |
| if (sliceInfo.isInitialTensor() || |
| (lvl >= 1 && lvlFullyResolved(tid, lvl - 1))) { |
| // First level, or the previous level has been fully resolved. |
| trivialSlice[tid][lvl] = true; |
| genResolvedSliceBegin(builder, loc, tid, lvl); |
| } else { |
| // The previous level has not been fully resolved. |
| trivialSlice[tid][lvl] = false; |
| genUnResolvedSliceBegin(builder, loc, tid, lvl); |
| } |
| return false; |
| } |
| |
| std::tuple<Value, Value, Value> |
| LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc, |
| TensorId tid, Level lvl) { |
| if (!isCompressedLT(lvlTypes[tid][lvl])) |
| llvm_unreachable("TODO"); |
| |
| // Otherwise, generate code to compute the next non-empty slice. |
| Value c0 = C_IDX(0), c1 = C_IDX(1); |
| |
| SliceInfo &info = sliceStack[tid].back(); |
| assert(info.slicedOnLvl == lvl); |
| // |
| // We forward to the next non-empty slice by |
| // if (minCrd > offset) { |
| // offset += 1 |
| // } else { |
| // minCrd = nextMinInSlice(); |
| // offset = minCrd - size + 1; |
| // } |
| // |
| // if (offset + size > parents.size) |
| // isNonEmpty = false; |
| // |
| Value absOffset = info.offset; |
| SmallVector<Value, 3> reduc = {info.minCrd, info.isNonEmpty, absOffset}; |
| Value sPtrBuf = slicePosBuffer[tid][lvl][info.depth - 1]; |
| Value fastPathP = CMPI(ugt, info.minCrd, absOffset); |
| auto ifOp = builder.create<scf::IfOp>(loc, ValueRange(reduc).getTypes(), |
| fastPathP, true); |
| { |
| OpBuilder::InsertionGuard guard(builder); |
| // Take the fast path |
| // if (minCrd > offset) { |
| // return offset += 1 |
| // } |
| builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); |
| reduc[2] = ADDI(absOffset, c1); |
| // Yield offset + 1. |
| YIELD(reduc); |
| |
| // else /*minCrd == offset*/ { |
| // for (i = 0; i < slicePos.size(); i+=kSliceIterWidth) { |
| // if (crd[pos[slicePos[i]]] == minCrd) { |
| // slicePos[i]++; |
| // } |
| // minCrd=min(minCrd, crd[pos[slicePos[i]]]); |
| // } |
| // offset = minCrd - size + 1; |
| // } |
| builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); |
| reduc[2] = absOffset; // restore value. |
| Value mSz = info.posTupleNum; // tuple number. |
| reduc[0] = lvlSizes[tid][lvl]; // next min coord |
| reduc[1] = constantI1(builder, loc, false); // isNonEmpty |
| auto loopArgs = static_cast<ValueRange>(reduc).drop_back(); |
| auto forOp = scf::buildLoopNest( |
| builder, loc, c0, mSz, c1, loopArgs, |
| [this, tid, lvl, c1, sPtrBuf, |
| &info](OpBuilder &builder, Location loc, ValueRange ivs, |
| ValueRange iterArgs) -> scf::ValueVector { |
| Value curMinCrd = iterArgs[0]; |
| Value isNonEmpty = iterArgs[1]; |
| |
| Type idxTp = builder.getIndexType(); |
| Value pLo = loadSlicePos(builder, loc, sPtrBuf, ivs.front(), |
| SlicePosKind::kLo); |
| Value pHi = loadSlicePos(builder, loc, sPtrBuf, ivs.front(), |
| SlicePosKind::kHi); |
| // |
| // if (pLo < pHi) // Only loads when in bounds. |
| // coord = load[pLo] |
| // if coord == minCrd |
| // pLo += 1 |
| // |
| // if (pLo < pHi) |
| // curMinCrd = min(curMinCrd, load[pLo]) |
| // |
| Value pred = CMPI(ult, pLo, pHi); |
| auto advPLo = builder.create<scf::IfOp>(loc, idxTp, pred, true); |
| /* if pLo < pHi */ { |
| builder.setInsertionPointToStart(&advPLo.getThenRegion().front()); |
| // coord = load[pLo] |
| Value coord = |
| genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo); |
| Value pred = CMPI(eq, coord, info.minCrd); |
| auto ifEqual = builder.create<scf::IfOp>(loc, idxTp, pred, true); |
| /* if coord == minCrd */ { |
| builder.setInsertionPointToStart( |
| &ifEqual.getThenRegion().front()); |
| Value newPlo = ADDI(pLo, c1); |
| // Updates the cache. |
| updateSlicePos(builder, loc, sPtrBuf, newPlo, ivs.front(), |
| SlicePosKind::kLo); |
| YIELD(newPlo); |
| } |
| /* else coord != minCrd */ { |
| builder.setInsertionPointToStart( |
| &ifEqual.getElseRegion().front()); |
| YIELD(pLo); |
| } |
| builder.setInsertionPointAfter(ifEqual); |
| YIELD(ifEqual.getResults()); |
| } |
| /* else pLo >= pHi */ { |
| builder.setInsertionPointToStart(&advPLo.getElseRegion().front()); |
| YIELD(pLo); |
| } |
| |
| builder.setInsertionPointAfter(advPLo); |
| pLo = advPLo.getResult(0); |
| Value lvlNonEmpty = CMPI(ult, pLo, pHi); |
| // Update minCrd. |
| auto newMin = |
| builder.create<scf::IfOp>(loc, idxTp, lvlNonEmpty, true); |
| builder.setInsertionPointToStart(&newMin.getThenRegion().front()); |
| YIELD(genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo)); |
| |
| builder.setInsertionPointToStart(&newMin.getElseRegion().front()); |
| YIELD(curMinCrd); |
| builder.setInsertionPointAfter(newMin); |
| |
| // isNonEmpty = isNonEmpty || lvlNonEmpty |
| isNonEmpty = |
| builder.create<arith::OrIOp>(loc, lvlNonEmpty, isNonEmpty); |
| curMinCrd = builder.create<arith::SelectOp>( |
| loc, CMPI(ult, newMin.getResult(0), curMinCrd), |
| newMin.getResult(0), curMinCrd); |
| return {curMinCrd, isNonEmpty}; |
| }); |
| |
| builder.setInsertionPointAfter(forOp.loops.front()); |
| // minOffset = minCrd + 1 >= size ? minCrd + 1 - size : c0 |
| Value tmp = ADDI(forOp.results.front(), c1); |
| auto [size, stride] = sliceMeta[tid][lvl][info.depth]; |
| assert(stride == 1 && "Not yet implemented"); |
| Value minOffset = SUBI(tmp, size); |
| Value p = CMPI(uge, tmp, size); |
| minOffset = SELECT(p, minOffset, c0); |
| |
| SmallVector<Value, 3> yields; |
| yields.assign(forOp.results.begin(), forOp.results.end()); |
| yields.push_back(minOffset); |
| YIELD(yields); |
| } |
| |
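| // A sketch of the final induction values assembled below: |
| //   nextAbsOffset = max(minOffset, offset + 1); |
| //   nextNonEmpty  = nextNonEmpty && (nextAbsOffset + size <= lvlSize); |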
| Value nextMinCrd = ifOp.getResults()[0]; |
| Value nextNonEmpty = ifOp.getResults()[1]; |
| |
| // The next offset should be at least offset + 1. |
| Value minOffset = ifOp.getResults()[2]; |
| Value nxOffset = ADDI(info.offset, c1); |
| Value maxPred = CMPI(ugt, minOffset, nxOffset); |
| Value nextAbsOffset = SELECT(maxPred, minOffset, nxOffset); |
| |
| auto [size, stride] = sliceMeta[tid][lvl][info.depth]; |
| assert(stride == 1 && "Not yet implemented"); |
| Value sliceUB = ADDI(nextAbsOffset, size); |
| |
| // FIXME: this only works if there is only one parent. |
| assert(info.depth - 1 == 0); |
| // nextNonEmpty = nextNonEmpty && slice upper bound <= parent upper bound. |
| nextNonEmpty = ANDI(nextNonEmpty, CMPI(ule, sliceUB, lvlSizes[tid][lvl])); |
| |
| // FIXME: compute relative offset. |
| assert(info.depth - 1 == 0); |
| return std::make_tuple(nextNonEmpty, nextMinCrd, nextAbsOffset); |
| } |
| |
| #undef CMPI |
| #undef C_IDX |
| #undef YIELD |
| #undef ADDI |
| #undef ANDI |
| #undef SUBI |
| #undef MULI |
| #undef SELECT |