| //===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h" |
| |
| #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" |
| #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" |
| #include "mlir/Conversion/LLVMCommon/TypeConverter.h" |
| #include "mlir/Dialect/Affine/IR/AffineOps.h" |
| #include "mlir/Dialect/Arith/IR/Arith.h" |
| #include "mlir/Dialect/Func/IR/FuncOps.h" |
| #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| #include "mlir/Dialect/GPU/TransformOps/Utils.h" |
| #include "mlir/Dialect/GPU/Transforms/Passes.h" |
| #include "mlir/Dialect/LLVMIR/NVVMDialect.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" |
| #include "mlir/Dialect/SCF/IR/SCF.h" |
| #include "mlir/Dialect/Transform/IR/TransformDialect.h" |
| #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" |
| #include "mlir/Dialect/Utils/IndexingUtils.h" |
| #include "mlir/Dialect/Vector/IR/VectorOps.h" |
| #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" |
| #include "mlir/IR/AffineExpr.h" |
| #include "mlir/IR/Builders.h" |
| #include "mlir/IR/BuiltinAttributes.h" |
| #include "mlir/IR/IRMapping.h" |
| #include "mlir/IR/MLIRContext.h" |
| #include "mlir/IR/OpDefinition.h" |
| #include "mlir/IR/Visitors.h" |
| #include "mlir/Support/LLVM.h" |
| #include "mlir/Transforms/DialectConversion.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/TypeSwitch.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include <type_traits> |
| |
| using namespace mlir; |
| using namespace mlir::gpu; |
| using namespace mlir::transform; |
| using namespace mlir::transform::gpu; |
| |
| #define DEBUG_TYPE "gpu-transforms" |
| #define DEBUG_TYPE_ALIAS "gpu-transforms-alias" |
| |
| #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ") |
| #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") |
| #define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ") |
| |
| //===----------------------------------------------------------------------===// |
| // Apply...ConversionPatternsOp |
| //===----------------------------------------------------------------------===// |
| |
| void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns( |
| TypeConverter &typeConverter, RewritePatternSet &patterns) { |
| auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter); |
| // NVVM uses alloca in the default address space to represent private |
| // memory allocations, so drop private annotations. NVVM uses address |
| // space 3 for shared memory. NVVM uses the default address space to |
| // represent global memory. |
| // Used in populateGpuToNVVMConversionPatterns so attaching it here for now. |
| // TODO: We should have a single to_nvvm_type_converter. |
| populateGpuMemorySpaceAttributeConversions( |
| llvmTypeConverter, [](AddressSpace space) -> unsigned { |
| switch (space) { |
| case AddressSpace::Global: |
| return static_cast<unsigned>( |
| NVVM::NVVMMemorySpace::kGlobalMemorySpace); |
| case AddressSpace::Workgroup: |
| return static_cast<unsigned>( |
| NVVM::NVVMMemorySpace::kSharedMemorySpace); |
| case AddressSpace::Private: |
| return 0; |
| } |
| llvm_unreachable("unknown address space enum value"); |
| return 0; |
| }); |
| // Used in GPUToNVVM/WmmaOpsToNvvm.cpp so attaching here for now. |
| // TODO: We should have a single to_nvvm_type_converter. |
| llvmTypeConverter.addConversion( |
| [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); }); |
| populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns); |
| } |
| |
| LogicalResult |
| transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter( |
| transform::TypeConverterBuilderOpInterface builder) { |
| if (builder.getTypeConverterType() != "LLVMTypeConverter") |
| return emitOpError("expected LLVMTypeConverter"); |
| return success(); |
| } |
| |
| void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns( |
| TypeConverter &typeConverter, RewritePatternSet &patterns) { |
| auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter); |
| populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns); |
| } |
| |
| LogicalResult |
| transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter( |
| transform::TypeConverterBuilderOpInterface builder) { |
| if (builder.getTypeConverterType() != "LLVMTypeConverter") |
| return emitOpError("expected LLVMTypeConverter"); |
| return success(); |
| } |
| |
| void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp:: |
| populatePatterns(TypeConverter &typeConverter, |
| RewritePatternSet &patterns) { |
| auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter); |
| populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns); |
| } |
| |
| LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp:: |
| verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) { |
| if (builder.getTypeConverterType() != "LLVMTypeConverter") |
| return emitOpError("expected LLVMTypeConverter"); |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Apply...PatternsOp |
| //===----------------------------------------------------------------------===// |
| |
| void ApplyGPURewritePatternsOp::populatePatterns(RewritePatternSet &patterns) { |
| populateGpuRewritePatterns(patterns); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // ApplyUnrollVectorsSubgroupMmaOp |
| //===----------------------------------------------------------------------===// |
| |
| /// Pick an unrolling order that allows tensor core operations to reuse the |
| /// LHS registers. |
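| /// For example, for a matmul-like contract with iterator types |
| /// [parallel, parallel, reduction] and an LHS indexing map over (m, k), the |
| /// resulting order is [2, 0, 1]: the reduction dimension first, then the |
| /// parallel dimension used by the LHS, then the remaining parallel dimension. |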
| static std::optional<SmallVector<int64_t>> |
| gpuMmaUnrollOrder(vector::ContractionOp contract) { |
| SmallVector<int64_t> order; |
| // First, make the reduction dimensions the outermost dimensions. |
| for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) { |
| if (vector::isReductionIterator(iter)) { |
| order.push_back(index); |
| } |
| } |
| |
| llvm::SmallDenseSet<int64_t> dims; |
| for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) { |
| dims.insert(cast<AffineDimExpr>(expr).getPosition()); |
| } |
| // Then the parallel dimensions that are part of the LHS, as we want to |
| // reuse the LHS. |
| for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) { |
| if (vector::isParallelIterator(iter) && dims.count(index)) { |
| order.push_back(index); |
| } |
| } |
| // Then the remaining parallel loops. |
| for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) { |
| if (vector::isParallelIterator(iter) && !dims.count(index)) { |
| order.push_back(index); |
| } |
| } |
| return order; |
| } |
| |
| /// Returns the target vector size for the target operation based on the native |
| /// vector size specified with `m`, `n`, and `k`. |
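| /// For example, with m = 16, n = 16, k = 8, a rank-3 vector.contract unrolls |
| /// to a native size of [16, 16, 8] and a rank-2 transfer_write to [16, 16]. |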
| static std::optional<SmallVector<int64_t>> |
| getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k) { |
| if (auto contract = dyn_cast<vector::ContractionOp>(op)) { |
| int64_t contractRank = contract.getIteratorTypes().size(); |
| if (contractRank < 3) |
| return std::nullopt; |
| SmallVector<int64_t> nativeSize(contractRank - 3, 1); |
| nativeSize.append({m, n, k}); |
| return nativeSize; |
| } |
| if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) { |
| int64_t writeRank = writeOp.getVectorType().getRank(); |
| if (writeRank < 2) |
| return std::nullopt; |
| SmallVector<int64_t> nativeSize(writeRank - 2, 1); |
| nativeSize.append({m, n}); |
| return nativeSize; |
| } |
| if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) { |
| // Transfer read ops may need different shapes based on how they are being |
| // used. For simplicity, just match the shape used by the |
| // extract_strided_slice users. |
| VectorType sliceType; |
| for (Operation *user : op->getUsers()) { |
| auto extract = dyn_cast<vector::ExtractStridedSliceOp>(user); |
| if (!extract) |
| return std::nullopt; |
| auto vecType = cast<VectorType>(extract.getResult().getType()); |
| if (sliceType && sliceType != vecType) |
| return std::nullopt; |
| sliceType = vecType; |
| } |
| return llvm::to_vector(sliceType.getShape()); |
| } |
| if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) { |
| if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) { |
| // TODO: The condition for unrolling elementwise should be restricted |
| // only to operations that need unrolling (connected to the contract). |
| if (vecType.getRank() < 2) |
| return std::nullopt; |
| |
| // First check whether there is a slice to infer the shape from. This is |
| // required for cases where the accumulator type differs from the input |
| // types, in which case we will see an `arith.ext_` between the contract |
| // and transfer_read which needs to be unrolled. |
| VectorType sliceType; |
| for (Operation *user : op->getUsers()) { |
| auto extract = dyn_cast<vector::ExtractStridedSliceOp>(user); |
| if (!extract) |
| return std::nullopt; |
| auto vecType = cast<VectorType>(extract.getResult().getType()); |
| if (sliceType && sliceType != vecType) |
| return std::nullopt; |
| sliceType = vecType; |
| } |
| if (sliceType) |
| return llvm::to_vector(sliceType.getShape()); |
| |
| // Else unroll for trailing elementwise. |
| SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1); |
| // Map elementwise ops to the output shape. |
| nativeSize.append({m, n}); |
| return nativeSize; |
| } |
| } |
| return std::nullopt; |
| } |
| |
| void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns( |
| RewritePatternSet &patterns) { |
| auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> { |
| auto contract = dyn_cast<vector::ContractionOp>(op); |
| if (!contract) |
| return std::nullopt; |
| return gpuMmaUnrollOrder(contract); |
| }; |
| |
| int64_t m = getM(); |
| int64_t n = getN(); |
| int64_t k = getK(); |
| auto nativeShapeFn = |
| [m, n, k](Operation *op) -> std::optional<SmallVector<int64_t>> { |
| return getSubgroupMmaNativeVectorSize(op, m, n, k); |
| }; |
| vector::populateVectorUnrollPatterns( |
| patterns, vector::UnrollVectorOptions() |
| .setNativeShapeFn(nativeShapeFn) |
| .setUnrollTraversalOrderFn(unrollOrder)); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // EliminateBarriersOp |
| //===----------------------------------------------------------------------===// |
| |
| void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) { |
| populateGpuEliminateBarriersPatterns(patterns); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Block and thread mapping utilities. |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
| /// Local types used for mapping verification. |
| struct MappingKind {}; |
| struct BlockMappingKind : MappingKind {}; |
| struct ThreadMappingKind : MappingKind {}; |
| } // namespace |
| |
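| /// Emit a definite failure, anchored on `transformOp` when it is provided and |
| /// on `target` otherwise. |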
| static DiagnosedSilenceableFailure |
| definiteFailureHelper(std::optional<TransformOpInterface> transformOp, |
| Operation *target, const Twine &message) { |
| if (transformOp.has_value()) |
| return transformOp->emitDefiniteFailure() << message; |
| return emitDefiniteFailure(target, message); |
| } |
| |
| /// Check that the given mapping attributes are of the desired kind and are |
| /// mutually consistent. |
| template <typename MappingKindType> |
| static DiagnosedSilenceableFailure |
| checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp, |
| scf::ForallOp forallOp) { |
| if (!forallOp.getMapping().has_value()) { |
| return definiteFailureHelper(transformOp, forallOp, |
| "scf.forall op requires a mapping attribute"); |
| } |
| |
| bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(), |
| llvm::IsaPred<GPUBlockMappingAttr>); |
| bool hasWarpgroupMapping = llvm::any_of( |
| forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>); |
| bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(), |
| llvm::IsaPred<GPUWarpMappingAttr>); |
| bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(), |
| llvm::IsaPred<GPUThreadMappingAttr>); |
| int64_t countMappingTypes = 0; |
| countMappingTypes += hasBlockMapping ? 1 : 0; |
| countMappingTypes += hasWarpgroupMapping ? 1 : 0; |
| countMappingTypes += hasWarpMapping ? 1 : 0; |
| countMappingTypes += hasThreadMapping ? 1 : 0; |
| if (countMappingTypes > 1) { |
| return definiteFailureHelper( |
| transformOp, forallOp, |
| "cannot mix different mapping types, use nesting"); |
| } |
| if (std::is_same<MappingKindType, BlockMappingKind>::value && |
| !hasBlockMapping) { |
| return definiteFailureHelper( |
| transformOp, forallOp, |
| "scf.forall op requires a mapping attribute of kind 'block'"); |
| } |
| if (std::is_same<MappingKindType, ThreadMappingKind>::value && |
| !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) { |
| return definiteFailureHelper(transformOp, forallOp, |
| "scf.forall op requires a mapping attribute " |
| "of kind 'thread' or 'warp'"); |
| } |
| |
| DenseSet<Attribute> seen; |
| for (Attribute map : forallOp.getMapping()->getValue()) { |
| if (seen.contains(map)) { |
| return definiteFailureHelper( |
| transformOp, forallOp, |
| "duplicate attribute, cannot map different loops " |
| "to the same mapping id"); |
| } |
| seen.insert(map); |
| } |
| |
| auto isLinear = [](Attribute a) { |
| return cast<DeviceMappingAttrInterface>(a).isLinearMapping(); |
| }; |
| if (llvm::any_of(forallOp.getMapping()->getValue(), isLinear) && |
| !llvm::all_of(forallOp.getMapping()->getValue(), isLinear)) { |
| return definiteFailureHelper( |
| transformOp, forallOp, |
| "cannot mix linear and non-linear mapping modes"); |
| } |
| |
| return DiagnosedSilenceableFailure::success(); |
| } |
| |
| template <typename MappingKindType> |
| static DiagnosedSilenceableFailure |
| verifyGpuMapping(std::optional<TransformOpInterface> transformOp, |
| scf::ForallOp forallOp) { |
| // Check the types of the mapping attributes match. |
| DiagnosedSilenceableFailure typeRes = |
| checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp); |
| if (!typeRes.succeeded()) |
| return typeRes; |
| |
| // Perform other non-type verifications. |
| if (!forallOp.isNormalized()) |
| return definiteFailureHelper(transformOp, forallOp, |
| "unsupported non-normalized loops"); |
| if (forallOp.getNumResults() > 0) |
| return definiteFailureHelper(transformOp, forallOp, |
| "only bufferized scf.forall can be mapped"); |
| bool useLinearMapping = cast<DeviceMappingAttrInterface>( |
| forallOp.getMapping()->getValue().front()) |
| .isLinearMapping(); |
| // TODO: This would be more natural with support for Optional<EnumParameter> |
| // in GPUDeviceMappingAttr. |
| int64_t maxNumMappingsSupported = |
| useLinearMapping ? (getMaxEnumValForMappingId() - |
| static_cast<uint64_t>(MappingId::DimZ)) |
| : 3; |
| if (forallOp.getRank() > maxNumMappingsSupported) { |
| return definiteFailureHelper(transformOp, forallOp, |
| "scf.forall with rank > ") |
| << maxNumMappingsSupported |
| << " does not lower for the specified mapping attribute type"; |
| } |
| auto numParallelIterations = |
| getConstantIntValues(forallOp.getMixedUpperBound()); |
| if (!forallOp.isNormalized() || !numParallelIterations.has_value()) { |
| return definiteFailureHelper( |
| transformOp, forallOp, |
| "requires statically sized, normalized forall op"); |
| } |
| return DiagnosedSilenceableFailure::success(); |
| } |
| |
| /// Struct to return the result of the rewrite of a forall operation. |
| struct ForallRewriteResult { |
| SmallVector<int64_t> mappingSizes; |
| SmallVector<Value> mappingIds; |
| }; |
| |
| /// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR. |
| template <typename OpTy, typename OperationOrBlock> |
| static void |
| replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc, |
| OperationOrBlock *parent, Value replacement, |
| ArrayRef<int64_t> availableMappingSizes) { |
| parent->walk([&](OpTy idOp) { |
| if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1) |
| rewriter.replaceAllUsesWith(idOp.getResult(), replacement); |
| }); |
| } |
| |
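| /// Common rewrite of a single scf.forall into GPU ids: complete the mapping |
| /// with unit-sized dimensions, sort the sizes by mapping id, materialize the |
| /// ids with `gpuIdBuilder`, predicate the body on dimensions where fewer ids |
| /// are required than `availableMappingSizes` provides, then inline the body |
| /// and erase the forall. On success, the mapping sizes and ids are returned |
| /// in `result`. |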
| static DiagnosedSilenceableFailure rewriteOneForallCommonImpl( |
| RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp, |
| scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes, |
| ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) { |
| LDBG("--start rewriteOneForallCommonImpl"); |
| |
| // Step 1. Complete the mapping to a full mapping (with 1s) if necessary. |
| auto numParallelIterations = |
| getConstantIntValues(forallOp.getMixedUpperBound()); |
| assert(forallOp.isNormalized() && numParallelIterations.has_value() && |
| "requires statically sized, normalized forall op"); |
| SmallVector<int64_t> tmpMappingSizes = numParallelIterations.value(); |
| SetVector<Attribute> forallMappingAttrs; |
| forallMappingAttrs.insert(forallOp.getMapping()->getValue().begin(), |
| forallOp.getMapping()->getValue().end()); |
| auto comparator = [](Attribute a, Attribute b) -> bool { |
| return cast<DeviceMappingAttrInterface>(a).getMappingId() < |
| cast<DeviceMappingAttrInterface>(b).getMappingId(); |
| }; |
| |
| // Step 1.b. In the linear case, compute the max mapping to avoid needlessly |
| // mapping all dimensions. In the 3-D mapping case we need to map all |
| // dimensions. |
| DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>( |
| *llvm::max_element(forallMappingAttrs, comparator)); |
| DeviceMappingAttrInterface maxLinearMapping; |
| if (maxMapping.isLinearMapping()) |
| maxLinearMapping = maxMapping; |
| for (auto attr : gpuIdBuilder.mappingAttributes) { |
| // Skip attributes whose mapping id exceeds the largest linear mapping id |
| // actually used by the forall. |
| if (maxLinearMapping && comparator(maxLinearMapping, attr)) |
| continue; |
| // Try to insert. If element was already present, just continue. |
| if (!forallMappingAttrs.insert(attr)) |
| continue; |
| // Otherwise, we have a new insertion without a size -> use size 1. |
| tmpMappingSizes.push_back(1); |
| } |
| LLVM_DEBUG( |
| llvm::interleaveComma( |
| tmpMappingSizes, |
| DBGS() << "----tmpMappingSizes extracted from scf.forall op: "); |
| llvm::dbgs() << "\n"); |
| |
| // Step 2. Sort the values by the corresponding DeviceMappingAttrInterface. |
| SmallVector<int64_t> forallMappingSizes = getValuesSortedByKey( |
| forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator); |
| LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes, |
| DBGS() << "----forallMappingSizes: "); |
| llvm::dbgs() << "\n"; llvm::interleaveComma( |
| forallMappingAttrs, DBGS() << "----forallMappingAttrs: "); |
| llvm::dbgs() << "\n"); |
| |
| // Step 3. Generate the mappingIdOps using the provided generator. |
| Location loc = forallOp.getLoc(); |
| OpBuilder::InsertionGuard guard(rewriter); |
| rewriter.setInsertionPoint(forallOp); |
| SmallVector<int64_t> originalBasis(availableMappingSizes); |
| bool originalBasisWasProvided = !originalBasis.empty(); |
| if (!originalBasisWasProvided) { |
| originalBasis = forallMappingSizes; |
| while (originalBasis.size() < 3) |
| originalBasis.push_back(1); |
| } |
| |
| IdBuilderResult builderResult = |
| gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis); |
| |
| // Step 4. Map the induction variables to the mappingIdOps; this may involve |
| // a permutation. |
| SmallVector<Value> mappingIdOps = builderResult.mappingIdOps; |
| IRMapping bvm; |
| for (auto [iv, dim] : llvm::zip_equal( |
| forallOp.getInductionVars(), |
| forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) { |
| auto mappingAttr = cast<DeviceMappingAttrInterface>(dim); |
| Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()]; |
| bvm.map(iv, peIdOp); |
| } |
| |
| // Step 5. If the originalBasis is already known, create conditionals to |
| // predicate the region. Otherwise, the current forall determines the |
| // originalBasis and no predication occurs. |
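| // For example, mapping 12 iterations onto an id with 16 available values |
| // guards the body with `id < 12`. |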
| Value predicate; |
| if (originalBasisWasProvided) { |
| SmallVector<int64_t> activeMappingSizes = builderResult.activeMappingSizes; |
| SmallVector<int64_t> availableMappingSizes = |
| builderResult.availableMappingSizes; |
| SmallVector<Value> activeIdOps = builderResult.activeIdOps; |
| // clang-format off |
| LLVM_DEBUG( |
| llvm::interleaveComma( |
| activeMappingSizes, DBGS() << "----activeMappingSizes: "); |
| llvm::dbgs() << "\n"; |
| llvm::interleaveComma( |
| availableMappingSizes, DBGS() << "----availableMappingSizes: "); |
| llvm::dbgs() << "\n"; |
| llvm::interleaveComma(activeIdOps, DBGS() << "----activeIdOps: "); |
| llvm::dbgs() << "\n"); |
| // clang-format on |
| for (auto [activeId, activeMappingSize, availableMappingSize] : |
| llvm::zip_equal(activeIdOps, activeMappingSizes, |
| availableMappingSizes)) { |
| if (activeMappingSize > availableMappingSize) { |
| return definiteFailureHelper( |
| transformOp, forallOp, |
| "Trying to map to fewer GPU threads than loop iterations but " |
| "overprovisioning is not yet supported. " |
| "Try additional tiling of the before mapping or map to more " |
| "threads."); |
| } |
| if (activeMappingSize == availableMappingSize) |
| continue; |
| Value idx = |
| rewriter.create<arith::ConstantIndexOp>(loc, activeMappingSize); |
| Value tmpPredicate = rewriter.create<arith::CmpIOp>( |
| loc, arith::CmpIPredicate::ult, activeId, idx); |
| LDBG("----predicate: " << tmpPredicate); |
| predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate, |
| tmpPredicate) |
| : tmpPredicate; |
| } |
| } |
| |
| // Step 6. Move the body of forallOp. |
| // Erase the terminator first; it will not be used. |
| rewriter.eraseOp(forallOp.getTerminator()); |
| Block *targetBlock; |
| Block::iterator insertionPoint; |
| if (predicate) { |
| // Step 6.a. If predicated, move the body to the start of the then-block. |
| auto ifOp = rewriter.create<scf::IfOp>(loc, predicate, |
| /*withElseRegion=*/false); |
| targetBlock = ifOp.thenBlock(); |
| insertionPoint = ifOp.thenBlock()->begin(); |
| } else { |
| // Step 6.b. Otherwise, move inline just at the rewriter insertion |
| // point. |
| targetBlock = forallOp->getBlock(); |
| insertionPoint = rewriter.getInsertionPoint(); |
| } |
| Block &sourceBlock = forallOp.getRegion().front(); |
| targetBlock->getOperations().splice(insertionPoint, |
| sourceBlock.getOperations()); |
| |
| // Step 7. RAUW indices. |
| for (Value loopIndex : forallOp.getInductionVars()) { |
| Value threadIdx = bvm.lookup(loopIndex); |
| rewriter.replaceAllUsesWith(loopIndex, threadIdx); |
| } |
| |
| // Step 8. Erase old op. |
| rewriter.eraseOp(forallOp); |
| |
| LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes, |
| DBGS() << "----result forallMappingSizes: "); |
| llvm::dbgs() << "\n"; llvm::interleaveComma( |
| mappingIdOps, DBGS() << "----result mappingIdOps: "); |
| llvm::dbgs() << "\n"); |
| |
| result = ForallRewriteResult{forallMappingSizes, mappingIdOps}; |
| return DiagnosedSilenceableFailure::success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // MapForallToBlocks |
| //===----------------------------------------------------------------------===// |
| |
| DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl( |
| RewriterBase &rewriter, TransformOpInterface transformOp, |
| scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims, |
| const GpuIdBuilder &gpuIdBuilder) { |
| LDBG("Start mapForallToBlocksImpl"); |
| |
| { |
| // GPU-specific verifications. There is no better place to anchor |
| // those right now: the ForallOp is target-independent and the transform |
| // op does not apply to individual ForallOp. |
| DiagnosedSilenceableFailure diag = |
| verifyGpuMapping<BlockMappingKind>(transformOp, forallOp); |
| if (!diag.succeeded()) |
| return diag; |
| } |
| |
| Location loc = forallOp.getLoc(); |
| Block *parentBlock = forallOp->getBlock(); |
| Value zero; |
| { |
| // Create an early zero index value for replacements and immediately reset |
| // the insertion point. |
| OpBuilder::InsertionGuard guard(rewriter); |
| rewriter.setInsertionPointToStart(parentBlock); |
| zero = rewriter.create<arith::ConstantIndexOp>(loc, 0); |
| } |
| |
| ForallRewriteResult rewriteResult; |
| DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl( |
| rewriter, transformOp, forallOp, |
| /*availableMappingSizes=*/gridDims, rewriteResult, gpuIdBuilder); |
| |
| // Return if anything goes wrong; use silenceable failure as a match |
| // failure. |
| if (!diag.succeeded()) |
| return diag; |
| |
| // If gridDims was not provided already, set it from the return. |
| if (gridDims.empty()) { |
| gridDims = rewriteResult.mappingSizes; |
| while (gridDims.size() < 3) |
| gridDims.push_back(1); |
| } |
| assert(gridDims.size() == 3 && "Need 3-D gridDims"); |
| |
| // Replace ids of dimensions known to be 1 by 0 to simplify the IR. |
| // Here, the result of mapping determines the available mapping sizes. |
| replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero, |
| rewriteResult.mappingSizes); |
| |
| return DiagnosedSilenceableFailure::success(); |
| } |
| |
| DiagnosedSilenceableFailure |
| mlir::transform::gpu::findTopLevelForallOp(Operation *target, |
| scf::ForallOp &topLevelForallOp, |
| TransformOpInterface transformOp) { |
| auto walkResult = target->walk([&](scf::ForallOp forallOp) { |
| if (forallOp->getParentOfType<scf::ForallOp>()) |
| return WalkResult::advance(); |
| if (topLevelForallOp) |
| // TODO: Handle multiple forall if they are independent. |
| return WalkResult::interrupt(); |
| topLevelForallOp = forallOp; |
| return WalkResult::advance(); |
| }); |
| |
| if (walkResult.wasInterrupted() || !topLevelForallOp) |
| return transformOp.emitSilenceableError() |
| << "could not find a unique topLevel scf.forall"; |
| return DiagnosedSilenceableFailure::success(); |
| } |
| |
| DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne( |
| transform::TransformRewriter &rewriter, Operation *target, |
| ApplyToEachResultList &results, transform::TransformState &state) { |
| LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target); |
| auto transformOp = cast<TransformOpInterface>(getOperation()); |
| |
| if (!getGenerateGpuLaunch() && !gpuLaunch) { |
| DiagnosedSilenceableFailure diag = |
| emitSilenceableError() |
| << "Given target is not gpu.launch, set `generate_gpu_launch` " |
| "attribute"; |
| diag.attachNote(target->getLoc()) << "when applied to this payload op"; |
| return diag; |
| } |
| |
| scf::ForallOp topLevelForallOp; |
| DiagnosedSilenceableFailure diag = mlir::transform::gpu::findTopLevelForallOp( |
| target, topLevelForallOp, transformOp); |
| if (!diag.succeeded()) { |
| diag.attachNote(target->getLoc()) << "when applied to this payload op"; |
| return diag; |
| } |
| assert(topLevelForallOp && "expect an scf.forall"); |
| |
| SmallVector<int64_t> gridDims{getGridDims()}; |
| if (!getGenerateGpuLaunch() && gridDims.size() != 3) |
| return transformOp.emitDefiniteFailure("transform require size-3 mapping"); |
| |
| OpBuilder::InsertionGuard guard(rewriter); |
| rewriter.setInsertionPoint(topLevelForallOp); |
| |
| // Generate gpu launch here and move the forall inside |
| if (getGenerateGpuLaunch()) { |
| DiagnosedSilenceableFailure diag = |
| createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch); |
| if (!diag.succeeded()) |
| return diag; |
| |
| rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); |
| Operation *newForallOp = rewriter.clone(*topLevelForallOp); |
| rewriter.eraseOp(topLevelForallOp); |
| topLevelForallOp = cast<scf::ForallOp>(newForallOp); |
| } |
| |
| // The BlockIdBuilder adapts to whatever is thrown at it. |
| bool useLinearMapping = false; |
| if (topLevelForallOp.getMapping()) { |
| auto mappingAttr = cast<DeviceMappingAttrInterface>( |
| topLevelForallOp.getMapping()->getValue().front()); |
| useLinearMapping = mappingAttr.isLinearMapping(); |
| } |
| GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping); |
| |
| diag = mlir::transform::gpu::mapForallToBlocksImpl( |
| rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder); |
| if (!diag.succeeded()) |
| return diag; |
| |
| // Set the GPU launch configuration for the grid dims late; this is |
| // subject to IR inspection. |
| diag = alterGpuLaunch(rewriter, gpuLaunch, |
| cast<TransformOpInterface>(getOperation()), gridDims[0], |
| gridDims[1], gridDims[2]); |
| |
| results.push_back(gpuLaunch); |
| return diag; |
| } |
| |
| LogicalResult transform::MapForallToBlocks::verify() { |
| if (!getGridDims().empty() && getGridDims().size() != 3) { |
| return emitOpError() << "transform requires empty or size-3 grid_dims"; |
| } |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // MapNestedForallToThreads |
| //===----------------------------------------------------------------------===// |
| |
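| /// Check that the requested mapping fits the available resources: in the 3-D |
| /// case, the x-dimension of `blockOrGridSizes` must be a multiple of |
| /// `factor` (e.g. the warp size for warp mappings), and in all cases the |
| /// number of parallel iterations times `factor` must not exceed the product |
| /// of `blockOrGridSizes`. |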
| static DiagnosedSilenceableFailure checkMappingSpec( |
| std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp, |
| ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes, |
| int factor, bool useLinearMapping = false) { |
| if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) { |
| auto diag = definiteFailureHelper( |
| transformOp, forallOp, |
| Twine("3-D mapping: size of threadIdx.x must be a multiple of ") + |
| std::to_string(factor)); |
| return diag; |
| } |
| if (computeProduct(numParallelIterations) * factor > |
| computeProduct(blockOrGridSizes)) { |
| auto diag = definiteFailureHelper( |
| transformOp, forallOp, |
| Twine("the number of required parallel resources (blocks or " |
| "threads) ") + |
| std::to_string(computeProduct(numParallelIterations) * factor) + |
| std::string(" overflows the number of available resources ") + |
| std::to_string(computeProduct(blockOrGridSizes))); |
| return diag; |
| } |
| return DiagnosedSilenceableFailure::success(); |
| } |
| |
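| /// Select the GPU id builder matching the mapping attribute of `forallOp` |
| /// (warpgroup, warp, or thread), after checking that the requested mapping |
| /// fits within `blockSizes`; the result is returned in `gpuIdBuilder`. |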
| static DiagnosedSilenceableFailure |
| getThreadIdBuilder(std::optional<TransformOpInterface> transformOp, |
| scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, |
| int64_t warpSize, GpuIdBuilder &gpuIdBuilder) { |
| auto mappingAttr = cast<DeviceMappingAttrInterface>( |
| forallOp.getMapping()->getValue().front()); |
| bool useLinearMapping = mappingAttr.isLinearMapping(); |
| |
| // Sanity checks that may result in runtime verification errors. |
| auto numParallelIterations = |
| getConstantIntValues(forallOp.getMixedUpperBound()); |
| if (!forallOp.isNormalized() || !numParallelIterations.has_value()) { |
| return definiteFailureHelper( |
| transformOp, forallOp, |
| "requires statically sized, normalized forall op"); |
| } |
| int64_t factor = 1; |
| if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) { |
| factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize; |
| } else if (isa<GPUWarpMappingAttr>(mappingAttr)) { |
| factor = warpSize; |
| } |
| DiagnosedSilenceableFailure diag = |
| checkMappingSpec(transformOp, forallOp, numParallelIterations.value(), |
| blockSizes, factor, useLinearMapping); |
| if (!diag.succeeded()) |
| return diag; |
| |
| // Start mapping. |
| MLIRContext *ctx = forallOp.getContext(); |
| gpuIdBuilder = |
| TypeSwitch<DeviceMappingAttrInterface, GpuIdBuilder>(mappingAttr) |
| .Case([&](GPUWarpgroupMappingAttr) { |
| return GpuWarpgroupIdBuilder(ctx, warpSize, useLinearMapping); |
| }) |
| .Case([&](GPUWarpMappingAttr) { |
| return GpuWarpIdBuilder(ctx, warpSize, useLinearMapping); |
| }) |
| .Case([&](GPUThreadMappingAttr) { |
| return GpuThreadIdBuilder(ctx, useLinearMapping); |
| }) |
| .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder { |
| llvm_unreachable("unknown mapping attribute"); |
| }); |
| return DiagnosedSilenceableFailure::success(); |
| } |
| |
| DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl( |
| RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp, |
| scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, int64_t warpSize, |
| bool syncAfterDistribute) { |
| |
| { |
| // GPU-specific verifications. There is no better place to anchor |
| // those right now: the ForallOp is target-independent and the transform |
| // op does not apply to individual ForallOp. |
| DiagnosedSilenceableFailure diag = |
| verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp); |
| if (!diag.succeeded()) |
| return diag; |
| } |
| |
| GpuIdBuilder gpuIdBuilder; |
| { |
| // Try to construct the id builder, if it fails, return. |
| DiagnosedSilenceableFailure diag = getThreadIdBuilder( |
| transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder); |
| if (!diag.succeeded()) |
| return diag; |
| } |
| |
| Location loc = forallOp.getLoc(); |
| OpBuilder::InsertionGuard g(rewriter); |
| // Insert after to allow for syncthreads after `forall` is erased. |
| rewriter.setInsertionPointAfter(forallOp); |
| ForallRewriteResult rewriteResult; |
| DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl( |
| rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder); |
| if (!diag.succeeded()) |
| return diag; |
| // Add a syncthreads if needed. TODO: warpsync |
| if (syncAfterDistribute) |
| rewriter.create<BarrierOp>(loc); |
| |
| return DiagnosedSilenceableFailure::success(); |
| } |
| |
| DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl( |
| RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp, |
| Operation *target, ArrayRef<int64_t> blockDims, int64_t warpSize, |
| bool syncAfterDistribute) { |
| LDBG("Start mapNestedForallToThreadsImpl"); |
| if (blockDims.size() != 3) { |
| return definiteFailureHelper(transformOp, target, |
| "requires size-3 thread mapping"); |
| } |
| |
| // Create an early zero index value for replacements. |
| Location loc = target->getLoc(); |
| Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0); |
| DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success(); |
| WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) { |
| diag = mlir::transform::gpu::mapOneForallToThreadsImpl( |
| rewriter, transformOp, forallOp, blockDims, warpSize, |
| syncAfterDistribute); |
| if (diag.isDefiniteFailure()) |
| return WalkResult::interrupt(); |
| if (diag.succeeded()) |
| return WalkResult::skip(); |
| return WalkResult::advance(); |
| }); |
| if (walkResult.wasInterrupted()) |
| return diag; |
| |
| // Replace ids of dimensions known to be 1 by 0 to simplify the IR. |
| // Here, the result of mapping determines the available mapping sizes. |
| replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero, |
| blockDims); |
| |
| return DiagnosedSilenceableFailure::success(); |
| } |
| |
| DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne( |
| transform::TransformRewriter &rewriter, Operation *target, |
| ApplyToEachResultList &results, TransformState &state) { |
| LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target); |
| auto transformOp = cast<TransformOpInterface>(getOperation()); |
| |
| // Basic high-level verifications. |
| if (!gpuLaunch) |
| return emitSilenceableError() << "Given target is not a gpu.launch"; |
| |
| // Mapping to block ids. |
| SmallVector<int64_t> blockDims{getBlockDims()}; |
| DiagnosedSilenceableFailure diag = |
| checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt, |
| blockDims[0], blockDims[1], blockDims[2]); |
| if (diag.isSilenceableFailure()) { |
| diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large"; |
| return diag; |
| } |
| |
| // Set the GPU launch configuration for the block dims early; this is not |
| // subject to IR inspection. |
| diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt, |
| std::nullopt, std::nullopt, blockDims[0], blockDims[1], |
| blockDims[2]); |
| |
| rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front()); |
| diag = |
| mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims, |
| getWarpSize(), getSyncAfterDistribute()); |
| |
| results.push_back(gpuLaunch.getOperation()); |
| return diag; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Transform op registration |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
| /// Registers the GPU transform ops and declares the dialects that these ops |
| /// may generate (SCF, arith, GPU). |
| class GPUTransformDialectExtension |
| : public transform::TransformDialectExtension< |
| GPUTransformDialectExtension> { |
| public: |
| GPUTransformDialectExtension() { |
| declareGeneratedDialect<scf::SCFDialect>(); |
| declareGeneratedDialect<arith::ArithDialect>(); |
| declareGeneratedDialect<GPUDialect>(); |
| registerTransformOps< |
| #define GET_OP_LIST |
| #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc" |
| >(); |
| } |
| }; |
| } // namespace |
| |
| #define GET_OP_CLASSES |
| #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc" |
| |
| void mlir::gpu::registerTransformDialectExtension(DialectRegistry ®istry) { |
| registry.addExtensions<GPUTransformDialectExtension>(); |
| } |