| //===- GPUDialect.cpp - MLIR Dialect for GPU Kernels implementation -------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements the GPU kernel-related dialect and its operations. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| |
| #include "mlir/Dialect/Arith/IR/Arith.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/IR/Attributes.h" |
| #include "mlir/IR/Builders.h" |
| #include "mlir/IR/BuiltinAttributes.h" |
| #include "mlir/IR/BuiltinOps.h" |
| #include "mlir/IR/BuiltinTypes.h" |
| #include "mlir/IR/Diagnostics.h" |
| #include "mlir/IR/DialectImplementation.h" |
| #include "mlir/IR/Matchers.h" |
| #include "mlir/IR/OpImplementation.h" |
| #include "mlir/IR/PatternMatch.h" |
| #include "mlir/IR/SymbolTable.h" |
| #include "mlir/IR/TypeUtilities.h" |
| #include "mlir/Interfaces/FunctionImplementation.h" |
| #include "mlir/Interfaces/SideEffectInterfaces.h" |
| #include "mlir/Support/LogicalResult.h" |
| #include "mlir/Transforms/InliningUtils.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/ADT/TypeSwitch.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/ErrorHandling.h" |
| #include "llvm/Support/StringSaver.h" |
| #include <cassert> |
| |
| using namespace mlir; |
| using namespace mlir::gpu; |
| |
| #include "mlir/Dialect/GPU/IR/GPUOpsDialect.cpp.inc" |
| |
| //===----------------------------------------------------------------------===// |
| // GPU Device Mapping Attributes |
| //===----------------------------------------------------------------------===// |
| |
| int64_t GPUBlockMappingAttr::getMappingId() const { |
| return static_cast<int64_t>(getBlock()); |
| } |
| |
| bool GPUBlockMappingAttr::isLinearMapping() const { |
| return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0); |
| } |
| |
| int64_t GPUBlockMappingAttr::getRelativeIndex() const { |
| return isLinearMapping() |
| ? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0) |
| : getMappingId(); |
| } |
| |
| int64_t GPUWarpgroupMappingAttr::getMappingId() const { |
| return static_cast<int64_t>(getWarpgroup()); |
| } |
| |
| bool GPUWarpgroupMappingAttr::isLinearMapping() const { |
| return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0); |
| } |
| |
| int64_t GPUWarpgroupMappingAttr::getRelativeIndex() const { |
| return isLinearMapping() |
| ? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0) |
| : getMappingId(); |
| } |
| |
| int64_t GPUWarpMappingAttr::getMappingId() const { |
| return static_cast<int64_t>(getWarp()); |
| } |
| |
| bool GPUWarpMappingAttr::isLinearMapping() const { |
| return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0); |
| } |
| |
| int64_t GPUWarpMappingAttr::getRelativeIndex() const { |
| return isLinearMapping() |
| ? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0) |
| : getMappingId(); |
| } |
| |
| int64_t GPUThreadMappingAttr::getMappingId() const { |
| return static_cast<int64_t>(getThread()); |
| } |
| |
| bool GPUThreadMappingAttr::isLinearMapping() const { |
| return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0); |
| } |
| |
| int64_t GPUThreadMappingAttr::getRelativeIndex() const { |
| return isLinearMapping() |
| ? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0) |
| : getMappingId(); |
| } |
| |
| int64_t GPUMemorySpaceMappingAttr::getMappingId() const { |
| return static_cast<int64_t>(getAddressSpace()); |
| } |
| |
| bool GPUMemorySpaceMappingAttr::isLinearMapping() const { |
| llvm_unreachable("GPUMemorySpaceMappingAttr does not support linear mapping"); |
| } |
| |
| int64_t GPUMemorySpaceMappingAttr::getRelativeIndex() const { |
| llvm_unreachable("GPUMemorySpaceMappingAttr does not support relative index"); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // MMAMatrixType |
| //===----------------------------------------------------------------------===// |
| |
| MMAMatrixType MMAMatrixType::get(ArrayRef<int64_t> shape, Type elementType, |
| StringRef operand) { |
| return Base::get(elementType.getContext(), shape, elementType, operand); |
| } |
| |
| MMAMatrixType |
| MMAMatrixType::getChecked(function_ref<InFlightDiagnostic()> emitError, |
| ArrayRef<int64_t> shape, Type elementType, |
| StringRef operand) { |
| return Base::getChecked(emitError, elementType.getContext(), shape, |
| elementType, operand); |
| } |
| |
| unsigned MMAMatrixType::getNumDims() const { return getImpl()->numDims; } |
| |
| ArrayRef<int64_t> MMAMatrixType::getShape() const { |
| return getImpl()->getShape(); |
| } |
| |
| Type MMAMatrixType::getElementType() const { return getImpl()->elementType; } |
| |
| StringRef MMAMatrixType::getOperand() const { return getImpl()->getOperand(); } |
| |
| bool MMAMatrixType::isValidElementType(Type elementType) { |
| return elementType.isF16() || elementType.isF32() || |
| elementType.isUnsignedInteger(8) || elementType.isSignedInteger(8) || |
| elementType.isInteger(32); |
| } |
| |
| LogicalResult |
| MMAMatrixType::verify(function_ref<InFlightDiagnostic()> emitError, |
| ArrayRef<int64_t> shape, Type elementType, |
| StringRef operand) { |
| if (!operand.equals("AOp") && !operand.equals("BOp") && |
| !operand.equals("COp")) |
| return emitError() << "operand expected to be one of AOp, BOp or COp"; |
| |
| if (shape.size() != 2) |
| return emitError() << "MMAMatrixType must have exactly two dimensions"; |
| |
| if (!MMAMatrixType::isValidElementType(elementType)) |
| return emitError() |
| << "MMAMatrixType elements must be SI8, UI8, I32, F16, or F32"; |
| |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPUDialect |
| //===----------------------------------------------------------------------===// |
| |
| bool GPUDialect::isWorkgroupMemoryAddressSpace(Attribute memorySpace) { |
| if (!memorySpace) |
| return false; |
| if (auto gpuAttr = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace)) |
| return gpuAttr.getValue() == getWorkgroupAddressSpace(); |
| return false; |
| } |
| |
| bool GPUDialect::hasWorkgroupMemoryAddressSpace(MemRefType type) { |
| Attribute memorySpace = type.getMemorySpace(); |
| return isWorkgroupMemoryAddressSpace(memorySpace); |
| } |
| |
| bool GPUDialect::isKernel(Operation *op) { |
| UnitAttr isKernelAttr = op->getAttrOfType<UnitAttr>(getKernelFuncAttrName()); |
| return static_cast<bool>(isKernelAttr); |
| } |
| |
| namespace { |
| /// This class defines the interface for handling inlining with gpu |
| /// operations. |
| struct GPUInlinerInterface : public DialectInlinerInterface { |
| using DialectInlinerInterface::DialectInlinerInterface; |
| |
| /// All gpu dialect ops can be inlined. |
| bool isLegalToInline(Operation *, Region *, bool, IRMapping &) const final { |
| return true; |
| } |
| }; |
| } // namespace |
| |
| void GPUDialect::initialize() { |
| addTypes<AsyncTokenType>(); |
| addTypes<MMAMatrixType>(); |
| addTypes<SparseDnTensorHandleType>(); |
| addTypes<SparseSpMatHandleType>(); |
| addTypes<SparseSpGEMMOpHandleType>(); |
| addOperations< |
| #define GET_OP_LIST |
| #include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc" |
| >(); |
| addAttributes< |
| #define GET_ATTRDEF_LIST |
| #include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc" |
| >(); |
| addInterfaces<GPUInlinerInterface>(); |
| } |
| |
| static std::string getSparseHandleKeyword(SparseHandleKind kind) { |
| switch (kind) { |
| case SparseHandleKind::DnTensor: |
| return "sparse.dntensor_handle"; |
| case SparseHandleKind::SpMat: |
| return "sparse.spmat_handle"; |
| case SparseHandleKind::SpGEMMOp: |
| return "sparse.spgemmop_handle"; |
| } |
| llvm_unreachable("unknown sparse handle kind"); |
| return ""; |
| } |
| |
| Type GPUDialect::parseType(DialectAsmParser &parser) const { |
| // Parse the main keyword for the type. |
| StringRef keyword; |
| if (parser.parseKeyword(&keyword)) |
| return Type(); |
| MLIRContext *context = getContext(); |
| |
| // Handle 'async token' types. |
| if (keyword == "async.token") |
| return AsyncTokenType::get(context); |
| |
| if (keyword == "mma_matrix") { |
| SMLoc beginLoc = parser.getNameLoc(); |
| |
| // Parse '<'. |
| if (parser.parseLess()) |
| return nullptr; |
| |
| // Parse the size and elementType. |
| SmallVector<int64_t> shape; |
| Type elementType; |
| if (parser.parseDimensionList(shape, /*allowDynamic=*/false) || |
| parser.parseType(elementType)) |
| return nullptr; |
| |
| // Parse ',' |
| if (parser.parseComma()) |
| return nullptr; |
| |
| // Parse operand. |
| std::string operand; |
| if (failed(parser.parseOptionalString(&operand))) |
| return nullptr; |
| |
| // Parse '>'. |
| if (parser.parseGreater()) |
| return nullptr; |
| |
| return MMAMatrixType::getChecked(mlir::detail::getDefaultDiagnosticEmitFn( |
| parser.getEncodedSourceLoc(beginLoc)), |
| shape, elementType, operand); |
| } |
| |
| if (keyword == getSparseHandleKeyword(SparseHandleKind::DnTensor)) |
| return SparseDnTensorHandleType::get(context); |
| if (keyword == getSparseHandleKeyword(SparseHandleKind::SpMat)) |
| return SparseSpMatHandleType::get(context); |
| if (keyword == getSparseHandleKeyword(SparseHandleKind::SpGEMMOp)) |
| return SparseSpGEMMOpHandleType::get(context); |
| |
| parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword); |
| return Type(); |
| } |
| // TODO: print refined type here. Notice that should be corresponding to the |
| // parser |
| void GPUDialect::printType(Type type, DialectAsmPrinter &os) const { |
| TypeSwitch<Type>(type) |
| .Case<AsyncTokenType>([&](Type) { os << "async.token"; }) |
| .Case<SparseDnTensorHandleType>([&](Type) { |
| os << getSparseHandleKeyword(SparseHandleKind::DnTensor); |
| }) |
| .Case<SparseSpMatHandleType>( |
| [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::SpMat); }) |
| .Case<SparseSpGEMMOpHandleType>([&](Type) { |
| os << getSparseHandleKeyword(SparseHandleKind::SpGEMMOp); |
| }) |
| .Case<MMAMatrixType>([&](MMAMatrixType fragTy) { |
| os << "mma_matrix<"; |
| auto shape = fragTy.getShape(); |
| for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim) |
| os << *dim << 'x'; |
| os << shape.back() << 'x' << fragTy.getElementType(); |
| os << ", \"" << fragTy.getOperand() << "\"" << '>'; |
| }) |
| .Default([](Type) { llvm_unreachable("unexpected 'gpu' type kind"); }); |
| } |
| |
| LogicalResult GPUDialect::verifyOperationAttribute(Operation *op, |
| NamedAttribute attr) { |
| if (!llvm::isa<UnitAttr>(attr.getValue()) || |
| attr.getName() != getContainerModuleAttrName()) |
| return success(); |
| |
| auto module = dyn_cast<ModuleOp>(op); |
| if (!module) |
| return op->emitError("expected '") |
| << getContainerModuleAttrName() << "' attribute to be attached to '" |
| << ModuleOp::getOperationName() << '\''; |
| |
| auto walkResult = module.walk([&module](LaunchFuncOp launchOp) -> WalkResult { |
| // Ignore launches that are nested more or less deep than functions in the |
| // module we are currently checking. |
| if (!launchOp->getParentOp() || |
| launchOp->getParentOp()->getParentOp() != module) |
| return success(); |
| |
| // Ignore launch ops with missing attributes here. The errors will be |
| // reported by the verifiers of those ops. |
| if (!launchOp->getAttrOfType<SymbolRefAttr>( |
| LaunchFuncOp::getKernelAttrName(launchOp->getName()))) |
| return success(); |
| |
| // Check that `launch_func` refers to a well-formed GPU kernel container. |
| StringAttr kernelContainerName = launchOp.getKernelModuleName(); |
| Operation *kernelContainer = module.lookupSymbol(kernelContainerName); |
| if (!kernelContainer) |
| return launchOp.emitOpError() |
| << "kernel container '" << kernelContainerName.getValue() |
| << "' is undefined"; |
| |
| // If the container is a GPU binary op return success. |
| if (isa<BinaryOp>(kernelContainer)) |
| return success(); |
| |
| auto kernelModule = dyn_cast<GPUModuleOp>(kernelContainer); |
| if (!kernelModule) |
| return launchOp.emitOpError() |
| << "kernel module '" << kernelContainerName.getValue() |
| << "' is undefined"; |
| |
| // Check that `launch_func` refers to a well-formed kernel function. |
| Operation *kernelFunc = module.lookupSymbol(launchOp.getKernelAttr()); |
| if (!kernelFunc) |
| return launchOp.emitOpError("kernel function '") |
| << launchOp.getKernel() << "' is undefined"; |
| auto kernelConvertedFunction = dyn_cast<FunctionOpInterface>(kernelFunc); |
| if (!kernelConvertedFunction) { |
| InFlightDiagnostic diag = launchOp.emitOpError() |
| << "referenced kernel '" << launchOp.getKernel() |
| << "' is not a function"; |
| diag.attachNote(kernelFunc->getLoc()) << "see the kernel definition here"; |
| return diag; |
| } |
| |
| if (!kernelFunc->getAttrOfType<mlir::UnitAttr>( |
| GPUDialect::getKernelFuncAttrName())) |
| return launchOp.emitOpError("kernel function is missing the '") |
| << GPUDialect::getKernelFuncAttrName() << "' attribute"; |
| |
| // TODO: If the kernel isn't a GPU function (which happens during separate |
| // compilation), do not check type correspondence as it would require the |
| // verifier to be aware of the type conversion. |
| auto kernelGPUFunction = dyn_cast<gpu::GPUFuncOp>(kernelFunc); |
| if (!kernelGPUFunction) |
| return success(); |
| |
| unsigned actualNumArguments = launchOp.getNumKernelOperands(); |
| unsigned expectedNumArguments = kernelGPUFunction.getNumArguments(); |
| if (expectedNumArguments != actualNumArguments) |
| return launchOp.emitOpError("got ") |
| << actualNumArguments << " kernel operands but expected " |
| << expectedNumArguments; |
| |
| auto functionType = kernelGPUFunction.getFunctionType(); |
| for (unsigned i = 0; i < expectedNumArguments; ++i) { |
| if (launchOp.getKernelOperand(i).getType() != functionType.getInput(i)) { |
| return launchOp.emitOpError("type of function argument ") |
| << i << " does not match"; |
| } |
| } |
| |
| return success(); |
| }); |
| |
| return walkResult.wasInterrupted() ? failure() : success(); |
| } |
| |
| /// Parses an optional list of async operands with an optional leading keyword. |
| /// (`async`)? (`[` ssa-id-list `]`)? |
| /// |
| /// This method is used by the tablegen assembly format for async ops as well. |
| static ParseResult parseAsyncDependencies( |
| OpAsmParser &parser, Type &asyncTokenType, |
| SmallVectorImpl<OpAsmParser::UnresolvedOperand> &asyncDependencies) { |
| auto loc = parser.getCurrentLocation(); |
| if (succeeded(parser.parseOptionalKeyword("async"))) { |
| if (parser.getNumResults() == 0) |
| return parser.emitError(loc, "needs to be named when marked 'async'"); |
| asyncTokenType = parser.getBuilder().getType<AsyncTokenType>(); |
| } |
| return parser.parseOperandList(asyncDependencies, |
| OpAsmParser::Delimiter::OptionalSquare); |
| } |
| |
| /// Prints optional async dependencies with its leading keyword. |
| /// (`async`)? (`[` ssa-id-list `]`)? |
| // Used by the tablegen assembly format for several async ops. |
| static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op, |
| Type asyncTokenType, |
| OperandRange asyncDependencies) { |
| if (asyncTokenType) |
| printer << "async"; |
| if (asyncDependencies.empty()) |
| return; |
| if (asyncTokenType) |
| printer << ' '; |
| printer << '['; |
| llvm::interleaveComma(asyncDependencies, printer); |
| printer << ']'; |
| } |
| |
| // GPU Memory attributions functions shared by LaunchOp and GPUFuncOp. |
| /// Parses a GPU function memory attribution. |
| /// |
| /// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)? |
| /// (`private` `(` ssa-id-and-type-list `)`)? |
| /// |
| /// Note that this function parses only one of the two similar parts, with the |
| /// keyword provided as argument. |
| static ParseResult |
| parseAttributions(OpAsmParser &parser, StringRef keyword, |
| SmallVectorImpl<OpAsmParser::Argument> &args) { |
| // If we could not parse the keyword, just assume empty list and succeed. |
| if (failed(parser.parseOptionalKeyword(keyword))) |
| return success(); |
| |
| return parser.parseArgumentList(args, OpAsmParser::Delimiter::Paren, |
| /*allowType=*/true); |
| } |
| |
| /// Prints a GPU function memory attribution. |
| static void printAttributions(OpAsmPrinter &p, StringRef keyword, |
| ArrayRef<BlockArgument> values) { |
| if (values.empty()) |
| return; |
| |
| p << ' ' << keyword << '('; |
| llvm::interleaveComma( |
| values, p, [&p](BlockArgument v) { p << v << " : " << v.getType(); }); |
| p << ')'; |
| } |
| |
| /// Verifies a GPU function memory attribution. |
| static LogicalResult verifyAttributions(Operation *op, |
| ArrayRef<BlockArgument> attributions, |
| gpu::AddressSpace memorySpace) { |
| for (Value v : attributions) { |
| auto type = llvm::dyn_cast<MemRefType>(v.getType()); |
| if (!type) |
| return op->emitOpError() << "expected memref type in attribution"; |
| |
| // We can only verify the address space if it hasn't already been lowered |
| // from the AddressSpaceAttr to a target-specific numeric value. |
| auto addressSpace = |
| llvm::dyn_cast_or_null<gpu::AddressSpaceAttr>(type.getMemorySpace()); |
| if (!addressSpace) |
| continue; |
| if (addressSpace.getValue() != memorySpace) |
| return op->emitOpError() |
| << "expected memory space " << stringifyAddressSpace(memorySpace) |
| << " in attribution"; |
| } |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // AllReduceOp |
| //===----------------------------------------------------------------------===// |
| |
| static LogicalResult verifyReduceOpAndType(gpu::AllReduceOperation opName, |
| Type resType) { |
| using Kind = gpu::AllReduceOperation; |
| if (llvm::is_contained( |
| {Kind::MINNUMF, Kind::MAXNUMF, Kind::MINIMUMF, Kind::MAXIMUMF}, |
| opName)) { |
| if (!isa<FloatType>(resType)) |
| return failure(); |
| } |
| |
| if (llvm::is_contained({Kind::MINSI, Kind::MINUI, Kind::MAXSI, Kind::MAXUI, |
| Kind::AND, Kind::OR, Kind::XOR}, |
| opName)) { |
| if (!isa<IntegerType>(resType)) |
| return failure(); |
| } |
| |
| return success(); |
| } |
| |
| LogicalResult gpu::AllReduceOp::verifyRegions() { |
| if (getBody().empty() != getOp().has_value()) |
| return emitError("expected either an op attribute or a non-empty body"); |
| if (!getBody().empty()) { |
| if (getBody().getNumArguments() != 2) |
| return emitError("expected two region arguments"); |
| for (auto argument : getBody().getArguments()) { |
| if (argument.getType() != getType()) |
| return emitError("incorrect region argument type"); |
| } |
| unsigned yieldCount = 0; |
| for (Block &block : getBody()) { |
| if (auto yield = dyn_cast<gpu::YieldOp>(block.getTerminator())) { |
| if (yield.getNumOperands() != 1) |
| return emitError("expected one gpu.yield operand"); |
| if (yield.getOperand(0).getType() != getType()) |
| return emitError("incorrect gpu.yield type"); |
| ++yieldCount; |
| } |
| } |
| if (yieldCount == 0) |
| return emitError("expected gpu.yield op in region"); |
| } else { |
| gpu::AllReduceOperation opName = *getOp(); |
| if (failed(verifyReduceOpAndType(opName, getType()))) { |
| return emitError() << '`' << gpu::stringifyAllReduceOperation(opName) |
| << "` reduction operation is not compatible with type " |
| << getType(); |
| } |
| } |
| |
| return success(); |
| } |
| |
| static bool canMakeGroupOpUniform(Operation *op) { |
| auto launchOp = dyn_cast<gpu::LaunchOp>(op->getParentOp()); |
| if (!launchOp) |
| return false; |
| |
| Region &body = launchOp.getBody(); |
| assert(!body.empty() && "Invalid region"); |
| |
| // Only convert ops in gpu::launch entry block for now. |
| return op->getBlock() == &body.front(); |
| } |
| |
| OpFoldResult gpu::AllReduceOp::fold(FoldAdaptor /*adaptor*/) { |
| if (!getUniform() && canMakeGroupOpUniform(*this)) { |
| setUniform(true); |
| return getResult(); |
| } |
| |
| return nullptr; |
| } |
| |
| // TODO: Support optional custom attributes (without dialect prefix). |
| static ParseResult parseAllReduceOperation(AsmParser &parser, |
| AllReduceOperationAttr &attr) { |
| StringRef enumStr; |
| if (!parser.parseOptionalKeyword(&enumStr)) { |
| std::optional<AllReduceOperation> op = |
| gpu::symbolizeAllReduceOperation(enumStr); |
| if (!op) |
| return parser.emitError(parser.getCurrentLocation(), "invalid op kind"); |
| attr = AllReduceOperationAttr::get(parser.getContext(), *op); |
| } |
| return success(); |
| } |
| |
| static void printAllReduceOperation(AsmPrinter &printer, Operation *op, |
| AllReduceOperationAttr attr) { |
| if (attr) |
| attr.print(printer); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // SubgroupReduceOp |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult gpu::SubgroupReduceOp::verify() { |
| Type elemType = getType(); |
| if (auto vecTy = dyn_cast<VectorType>(elemType)) { |
| if (vecTy.isScalable()) |
| return emitOpError() << "is not compatible with scalable vector types"; |
| |
| elemType = vecTy.getElementType(); |
| } |
| |
| gpu::AllReduceOperation opName = getOp(); |
| if (failed(verifyReduceOpAndType(opName, elemType))) { |
| return emitError() << '`' << gpu::stringifyAllReduceOperation(opName) |
| << "` reduction operation is not compatible with type " |
| << getType(); |
| } |
| return success(); |
| } |
| |
| OpFoldResult gpu::SubgroupReduceOp::fold(FoldAdaptor /*adaptor*/) { |
| if (!getUniform() && canMakeGroupOpUniform(*this)) { |
| setUniform(true); |
| return getResult(); |
| } |
| |
| return nullptr; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // AsyncOpInterface |
| //===----------------------------------------------------------------------===// |
| |
| void gpu::addAsyncDependency(Operation *op, Value token) { |
| op->insertOperands(0, {token}); |
| if (!op->template hasTrait<OpTrait::AttrSizedOperandSegments>()) |
| return; |
| auto attrName = |
| OpTrait::AttrSizedOperandSegments<void>::getOperandSegmentSizeAttr(); |
| auto sizeAttr = op->template getAttrOfType<DenseI32ArrayAttr>(attrName); |
| |
| // Async dependencies is the only variadic operand. |
| if (!sizeAttr) |
| return; |
| |
| SmallVector<int32_t, 8> sizes(sizeAttr.asArrayRef()); |
| ++sizes.front(); |
| op->setAttr(attrName, Builder(op->getContext()).getDenseI32ArrayAttr(sizes)); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // LaunchOp |
| //===----------------------------------------------------------------------===// |
| |
| void LaunchOp::build(OpBuilder &builder, OperationState &result, |
| Value gridSizeX, Value gridSizeY, Value gridSizeZ, |
| Value getBlockSizeX, Value getBlockSizeY, |
| Value getBlockSizeZ, Value dynamicSharedMemorySize, |
| Type asyncTokenType, ValueRange asyncDependencies, |
| TypeRange workgroupAttributions, |
| TypeRange privateAttributions) { |
| // Add a WorkGroup attribution attribute. This attribute is required to |
| // identify private attributions in the list of block argguments. |
| result.addAttribute(getNumWorkgroupAttributionsAttrName(), |
| builder.getI64IntegerAttr(workgroupAttributions.size())); |
| |
| // Add Op operands. |
| result.addOperands(asyncDependencies); |
| if (asyncTokenType) |
| result.types.push_back(builder.getType<AsyncTokenType>()); |
| |
| // Add grid and block sizes as op operands, followed by the data operands. |
| result.addOperands({gridSizeX, gridSizeY, gridSizeZ, getBlockSizeX, |
| getBlockSizeY, getBlockSizeZ}); |
| if (dynamicSharedMemorySize) |
| result.addOperands(dynamicSharedMemorySize); |
| |
| // Create a kernel body region with kNumConfigRegionAttributes + N memory |
| // attributions, where the first kNumConfigRegionAttributes arguments have |
| // `index` type and the rest have the same types as the data operands. |
| Region *kernelRegion = result.addRegion(); |
| Block *body = new Block(); |
| // TODO: Allow passing in proper locations here. |
| for (unsigned i = 0; i < kNumConfigRegionAttributes; ++i) |
| body->addArgument(builder.getIndexType(), result.location); |
| // Add WorkGroup & Private attributions to the region arguments. |
| for (Type argTy : workgroupAttributions) |
| body->addArgument(argTy, result.location); |
| for (Type argTy : privateAttributions) |
| body->addArgument(argTy, result.location); |
| kernelRegion->push_back(body); |
| // Fill OperandSegmentSize Attribute. |
| SmallVector<int32_t, 8> segmentSizes(8, 1); |
| segmentSizes.front() = asyncDependencies.size(); |
| segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0; |
| result.addAttribute(getOperandSegmentSizeAttr(), |
| builder.getDenseI32ArrayAttr(segmentSizes)); |
| } |
| |
| KernelDim3 LaunchOp::getBlockIds() { |
| assert(!getBody().empty() && "LaunchOp body must not be empty."); |
| auto args = getBody().getArguments(); |
| return KernelDim3{args[0], args[1], args[2]}; |
| } |
| |
| KernelDim3 LaunchOp::getThreadIds() { |
| assert(!getBody().empty() && "LaunchOp body must not be empty."); |
| auto args = getBody().getArguments(); |
| return KernelDim3{args[3], args[4], args[5]}; |
| } |
| |
| KernelDim3 LaunchOp::getGridSize() { |
| assert(!getBody().empty() && "LaunchOp body must not be empty."); |
| auto args = getBody().getArguments(); |
| return KernelDim3{args[6], args[7], args[8]}; |
| } |
| |
| KernelDim3 LaunchOp::getBlockSize() { |
| assert(!getBody().empty() && "LaunchOp body must not be empty."); |
| auto args = getBody().getArguments(); |
| return KernelDim3{args[9], args[10], args[11]}; |
| } |
| |
| KernelDim3 LaunchOp::getGridSizeOperandValues() { |
| auto operands = getOperands().drop_front(getAsyncDependencies().size()); |
| return KernelDim3{operands[0], operands[1], operands[2]}; |
| } |
| |
| KernelDim3 LaunchOp::getBlockSizeOperandValues() { |
| auto operands = getOperands().drop_front(getAsyncDependencies().size()); |
| return KernelDim3{operands[3], operands[4], operands[5]}; |
| } |
| |
| LogicalResult LaunchOp::verifyRegions() { |
| // Kernel launch takes kNumConfigOperands leading operands for grid/block |
| // sizes and transforms them into kNumConfigRegionAttributes region arguments |
| // for block/thread identifiers and grid/block sizes. |
| if (!getBody().empty()) { |
| if (getBody().getNumArguments() < |
| kNumConfigRegionAttributes + getNumWorkgroupAttributions()) |
| return emitOpError("unexpected number of region arguments"); |
| } |
| |
| // Verify Attributions Address Spaces. |
| if (failed(verifyAttributions(getOperation(), getWorkgroupAttributions(), |
| GPUDialect::getWorkgroupAddressSpace())) || |
| failed(verifyAttributions(getOperation(), getPrivateAttributions(), |
| GPUDialect::getPrivateAddressSpace()))) |
| return failure(); |
| |
| // Block terminators without successors are expected to exit the kernel region |
| // and must be `gpu.terminator`. |
| for (Block &block : getBody()) { |
| if (block.empty()) |
| continue; |
| if (block.back().getNumSuccessors() != 0) |
| continue; |
| if (!isa<gpu::TerminatorOp>(&block.back())) { |
| return block.back() |
| .emitError() |
| .append("expected '", gpu::TerminatorOp::getOperationName(), |
| "' or a terminator with successors") |
| .attachNote(getLoc()) |
| .append("in '", LaunchOp::getOperationName(), "' body region"); |
| } |
| } |
| |
| if (getNumResults() == 0 && getAsyncToken()) |
| return emitOpError("needs to be named when async keyword is specified"); |
| |
| return success(); |
| } |
| |
| // Pretty-print the kernel grid/block size assignment as |
| // (%iter-x, %iter-y, %iter-z) in |
| // (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use) |
| // where %size-* and %iter-* will correspond to the body region arguments. |
| static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size, |
| KernelDim3 operands, KernelDim3 ids) { |
| p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in ("; |
| p << size.x << " = " << operands.x << ", "; |
| p << size.y << " = " << operands.y << ", "; |
| p << size.z << " = " << operands.z << ')'; |
| } |
| |
| void LaunchOp::print(OpAsmPrinter &p) { |
| if (getAsyncToken()) { |
| p << " async"; |
| if (!getAsyncDependencies().empty()) |
| p << " [" << getAsyncDependencies() << ']'; |
| } |
| // Print the launch configuration. |
| p << ' ' << getBlocksKeyword(); |
| printSizeAssignment(p, getGridSize(), getGridSizeOperandValues(), |
| getBlockIds()); |
| p << ' ' << getThreadsKeyword(); |
| printSizeAssignment(p, getBlockSize(), getBlockSizeOperandValues(), |
| getThreadIds()); |
| if (getDynamicSharedMemorySize()) |
| p << ' ' << getDynamicSharedMemorySizeKeyword() << ' ' |
| << getDynamicSharedMemorySize(); |
| |
| printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions()); |
| printAttributions(p, getPrivateKeyword(), getPrivateAttributions()); |
| |
| p << ' '; |
| |
| p.printRegion(getBody(), /*printEntryBlockArgs=*/false); |
| p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{ |
| LaunchOp::getOperandSegmentSizeAttr(), |
| getNumWorkgroupAttributionsAttrName()}); |
| } |
| |
| // Parse the size assignment blocks for blocks and threads. These have the form |
| // (%region_arg, %region_arg, %region_arg) in |
| // (%region_arg = %operand, %region_arg = %operand, %region_arg = %operand) |
| // where %region_arg are percent-identifiers for the region arguments to be |
| // introduced further (SSA defs), and %operand are percent-identifiers for the |
| // SSA value uses. |
| static ParseResult |
| parseSizeAssignment(OpAsmParser &parser, |
| MutableArrayRef<OpAsmParser::UnresolvedOperand> sizes, |
| MutableArrayRef<OpAsmParser::UnresolvedOperand> regionSizes, |
| MutableArrayRef<OpAsmParser::UnresolvedOperand> indices) { |
| assert(indices.size() == 3 && "space for three indices expected"); |
| SmallVector<OpAsmParser::UnresolvedOperand, 3> args; |
| if (parser.parseOperandList(args, OpAsmParser::Delimiter::Paren, |
| /*allowResultNumber=*/false) || |
| parser.parseKeyword("in") || parser.parseLParen()) |
| return failure(); |
| std::move(args.begin(), args.end(), indices.begin()); |
| |
| for (int i = 0; i < 3; ++i) { |
| if (i != 0 && parser.parseComma()) |
| return failure(); |
| if (parser.parseOperand(regionSizes[i], /*allowResultNumber=*/false) || |
| parser.parseEqual() || parser.parseOperand(sizes[i])) |
| return failure(); |
| } |
| |
| return parser.parseRParen(); |
| } |
| |
| /// Parses a Launch operation. |
| /// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)? |
| /// `blocks` `(` ssa-id-list `)` `in` ssa-reassignment |
| /// `threads` `(` ssa-id-list `)` `in` ssa-reassignment |
| /// memory-attribution |
| /// region attr-dict? |
| /// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` |
| ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { |
| // Sizes of the grid and block. |
| SmallVector<OpAsmParser::UnresolvedOperand, LaunchOp::kNumConfigOperands> |
| sizes(LaunchOp::kNumConfigOperands); |
| MutableArrayRef<OpAsmParser::UnresolvedOperand> sizesRef(sizes); |
| |
| // Actual (data) operands passed to the kernel. |
| SmallVector<OpAsmParser::UnresolvedOperand, 4> dataOperands; |
| |
| // Region arguments to be created. |
| SmallVector<OpAsmParser::UnresolvedOperand, 16> regionArgs( |
| LaunchOp::kNumConfigRegionAttributes); |
| MutableArrayRef<OpAsmParser::UnresolvedOperand> regionArgsRef(regionArgs); |
| |
| // Parse optional async dependencies. |
| SmallVector<OpAsmParser::UnresolvedOperand, 4> asyncDependencies; |
| Type asyncTokenType; |
| if (failed( |
| parseAsyncDependencies(parser, asyncTokenType, asyncDependencies)) || |
| parser.resolveOperands(asyncDependencies, asyncTokenType, |
| result.operands)) |
| return failure(); |
| if (parser.getNumResults() > 0) |
| result.types.push_back(asyncTokenType); |
| |
| // Parse the size assignment segments: the first segment assigns grid sizes |
| // and defines values for block identifiers; the second segment assigns block |
| // sizes and defines values for thread identifiers. In the region argument |
| // list, identifiers precede sizes, and block-related values precede |
| // thread-related values. |
| if (parser.parseKeyword(LaunchOp::getBlocksKeyword().data()) || |
| parseSizeAssignment(parser, sizesRef.take_front(3), |
| regionArgsRef.slice(6, 3), |
| regionArgsRef.slice(0, 3)) || |
| parser.parseKeyword(LaunchOp::getThreadsKeyword().data()) || |
| parseSizeAssignment(parser, sizesRef.drop_front(3), |
| regionArgsRef.slice(9, 3), |
| regionArgsRef.slice(3, 3)) || |
| parser.resolveOperands(sizes, parser.getBuilder().getIndexType(), |
| result.operands)) |
| return failure(); |
| |
| OpAsmParser::UnresolvedOperand dynamicSharedMemorySize; |
| bool hasDynamicSharedMemorySize = false; |
| if (!parser.parseOptionalKeyword( |
| LaunchOp::getDynamicSharedMemorySizeKeyword())) { |
| hasDynamicSharedMemorySize = true; |
| if (parser.parseOperand(dynamicSharedMemorySize) || |
| parser.resolveOperand(dynamicSharedMemorySize, |
| parser.getBuilder().getI32Type(), |
| result.operands)) |
| return failure(); |
| } |
| |
| // Create the region arguments, it has kNumConfigRegionAttributes arguments |
| // that correspond to block/thread identifiers and grid/block sizes, all |
| // having `index` type, a variadic number of WorkGroup Attributions and |
| // a variadic number of Private Attributions. The number of WorkGroup |
| // Attributions is stored in the attr with name: |
| // LaunchOp::getNumWorkgroupAttributionsAttrName(). |
| Type index = parser.getBuilder().getIndexType(); |
| SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes( |
| LaunchOp::kNumConfigRegionAttributes, index); |
| |
| SmallVector<OpAsmParser::Argument> regionArguments; |
| for (auto ssaValueAndType : llvm::zip(regionArgs, dataTypes)) { |
| OpAsmParser::Argument arg; |
| arg.ssaName = std::get<0>(ssaValueAndType); |
| arg.type = std::get<1>(ssaValueAndType); |
| regionArguments.push_back(arg); |
| } |
| |
| Builder &builder = parser.getBuilder(); |
| // Parse workgroup memory attributions. |
| if (failed(parseAttributions(parser, LaunchOp::getWorkgroupKeyword(), |
| regionArguments))) |
| return failure(); |
| |
| // Store the number of operands we just parsed as the number of workgroup |
| // memory attributions. |
| unsigned numWorkgroupAttrs = |
| regionArguments.size() - LaunchOp::kNumConfigRegionAttributes; |
| result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(), |
| builder.getI64IntegerAttr(numWorkgroupAttrs)); |
| |
| // Parse private memory attributions. |
| if (failed(parseAttributions(parser, LaunchOp::getPrivateKeyword(), |
| regionArguments))) |
| return failure(); |
| |
| // Introduce the body region and parse it. The region has |
| // kNumConfigRegionAttributes arguments that correspond to |
| // block/thread identifiers and grid/block sizes, all having `index` type. |
| Region *body = result.addRegion(); |
| if (parser.parseRegion(*body, regionArguments) || |
| parser.parseOptionalAttrDict(result.attributes)) |
| return failure(); |
| |
| SmallVector<int32_t, 8> segmentSizes(8, 1); |
| segmentSizes.front() = asyncDependencies.size(); |
| segmentSizes.back() = hasDynamicSharedMemorySize ? 1 : 0; |
| result.addAttribute(LaunchOp::getOperandSegmentSizeAttr(), |
| parser.getBuilder().getDenseI32ArrayAttr(segmentSizes)); |
| return success(); |
| } |
| |
| /// Simplify the gpu.launch when the range of a thread or block ID is |
| /// trivially known to be one. |
| struct FoldLaunchArguments : public OpRewritePattern<LaunchOp> { |
| using OpRewritePattern<LaunchOp>::OpRewritePattern; |
| LogicalResult matchAndRewrite(LaunchOp op, |
| PatternRewriter &rewriter) const override { |
| // If the range implies a single value for `id`, replace `id`'s uses by |
| // zero. |
| Value zero; |
| bool simplified = false; |
| auto constPropIdUses = [&](Value id, Value size) { |
| // Check if size is trivially one. |
| if (!matchPattern(size, m_One())) |
| return; |
| if (id.getUses().empty()) |
| return; |
| if (!simplified) { |
| // Create a zero value the first time. |
| OpBuilder::InsertionGuard guard(rewriter); |
| rewriter.setInsertionPointToStart(&op.getBody().front()); |
| zero = |
| rewriter.create<arith::ConstantIndexOp>(op.getLoc(), /*value=*/0); |
| } |
| rewriter.replaceAllUsesWith(id, zero); |
| simplified = true; |
| }; |
| constPropIdUses(op.getBlockIds().x, op.getGridSizeX()); |
| constPropIdUses(op.getBlockIds().y, op.getGridSizeY()); |
| constPropIdUses(op.getBlockIds().z, op.getGridSizeZ()); |
| constPropIdUses(op.getThreadIds().x, op.getBlockSizeX()); |
| constPropIdUses(op.getThreadIds().y, op.getBlockSizeY()); |
| constPropIdUses(op.getThreadIds().z, op.getBlockSizeZ()); |
| |
| return success(simplified); |
| } |
| }; |
| |
| void LaunchOp::getCanonicalizationPatterns(RewritePatternSet &rewrites, |
| MLIRContext *context) { |
| rewrites.add<FoldLaunchArguments>(context); |
| } |
| |
| /// Adds a new block argument that corresponds to buffers located in |
| /// workgroup memory. |
| BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) { |
| auto attrName = getNumWorkgroupAttributionsAttrName(); |
| auto attr = (*this)->getAttrOfType<IntegerAttr>(attrName); |
| (*this)->setAttr(attrName, |
| IntegerAttr::get(attr.getType(), attr.getValue() + 1)); |
| return getBody().insertArgument( |
| LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc); |
| } |
| |
| /// Adds a new block argument that corresponds to buffers located in |
| /// private memory. |
| BlockArgument LaunchOp::addPrivateAttribution(Type type, Location loc) { |
| // Buffers on the private memory always come after buffers on the workgroup |
| // memory. |
| return getBody().addArgument(type, loc); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // LaunchFuncOp |
| //===----------------------------------------------------------------------===// |
| |
| void LaunchFuncOp::build(OpBuilder &builder, OperationState &result, |
| GPUFuncOp kernelFunc, KernelDim3 gridSize, |
| KernelDim3 getBlockSize, Value dynamicSharedMemorySize, |
| ValueRange kernelOperands, Type asyncTokenType, |
| ValueRange asyncDependencies, |
| std::optional<KernelDim3> clusterSize) { |
| result.addOperands(asyncDependencies); |
| if (asyncTokenType) |
| result.types.push_back(builder.getType<AsyncTokenType>()); |
| |
| // Add grid and block sizes as op operands, followed by the data operands. |
| result.addOperands({gridSize.x, gridSize.y, gridSize.z, getBlockSize.x, |
| getBlockSize.y, getBlockSize.z}); |
| if (clusterSize.has_value()) |
| result.addOperands({clusterSize->x, clusterSize->y, clusterSize->z}); |
| if (dynamicSharedMemorySize) |
| result.addOperands(dynamicSharedMemorySize); |
| result.addOperands(kernelOperands); |
| auto kernelModule = kernelFunc->getParentOfType<GPUModuleOp>(); |
| auto kernelSymbol = |
| SymbolRefAttr::get(kernelModule.getNameAttr(), |
| {SymbolRefAttr::get(kernelFunc.getNameAttr())}); |
| |
| Properties &prop = result.getOrAddProperties<Properties>(); |
| prop.kernel = kernelSymbol; |
| size_t segmentSizesLen = std::size(prop.operandSegmentSizes); |
| // Initialize the segment sizes to 1. |
| for (auto &sz : prop.operandSegmentSizes) |
| sz = 1; |
| prop.operandSegmentSizes[0] = asyncDependencies.size(); |
| if (!clusterSize.has_value()) { |
| prop.operandSegmentSizes[segmentSizesLen - 4] = 0; |
| prop.operandSegmentSizes[segmentSizesLen - 5] = 0; |
| prop.operandSegmentSizes[segmentSizesLen - 6] = 0; |
| } |
| prop.operandSegmentSizes[segmentSizesLen - 3] = |
| dynamicSharedMemorySize ? 1 : 0; |
| prop.operandSegmentSizes[segmentSizesLen - 2] = |
| static_cast<int32_t>(kernelOperands.size()); |
| prop.operandSegmentSizes[segmentSizesLen - 1] = 0; |
| } |
| |
| void LaunchFuncOp::build(OpBuilder &builder, OperationState &result, |
| SymbolRefAttr kernel, KernelDim3 gridSize, |
| KernelDim3 getBlockSize, Value dynamicSharedMemorySize, |
| ValueRange kernelOperands, Value asyncObject, |
| std::optional<KernelDim3> clusterSize) { |
| // Add grid and block sizes as op operands, followed by the data operands. |
| result.addOperands({gridSize.x, gridSize.y, gridSize.z, getBlockSize.x, |
| getBlockSize.y, getBlockSize.z}); |
| if (clusterSize.has_value()) |
| result.addOperands({clusterSize->x, clusterSize->y, clusterSize->z}); |
| if (dynamicSharedMemorySize) |
| result.addOperands(dynamicSharedMemorySize); |
| result.addOperands(kernelOperands); |
| if (asyncObject) |
| result.addOperands(asyncObject); |
| Properties &prop = result.getOrAddProperties<Properties>(); |
| prop.kernel = kernel; |
| size_t segmentSizesLen = std::size(prop.operandSegmentSizes); |
| // Initialize the segment sizes to 1. |
| for (auto &sz : prop.operandSegmentSizes) |
| sz = 1; |
| prop.operandSegmentSizes[0] = 0; |
| if (!clusterSize.has_value()) { |
| prop.operandSegmentSizes[segmentSizesLen - 4] = 0; |
| prop.operandSegmentSizes[segmentSizesLen - 5] = 0; |
| prop.operandSegmentSizes[segmentSizesLen - 6] = 0; |
| } |
| prop.operandSegmentSizes[segmentSizesLen - 3] = |
| dynamicSharedMemorySize ? 1 : 0; |
| prop.operandSegmentSizes[segmentSizesLen - 2] = |
| static_cast<int32_t>(kernelOperands.size()); |
| prop.operandSegmentSizes[segmentSizesLen - 1] = asyncObject ? 1 : 0; |
| } |
| |
| StringAttr LaunchFuncOp::getKernelModuleName() { |
| return getKernel().getRootReference(); |
| } |
| |
| StringAttr LaunchFuncOp::getKernelName() { |
| return getKernel().getLeafReference(); |
| } |
| |
| unsigned LaunchFuncOp::getNumKernelOperands() { |
| return getKernelOperands().size(); |
| } |
| |
| Value LaunchFuncOp::getKernelOperand(unsigned i) { |
| return getKernelOperands()[i]; |
| } |
| |
| KernelDim3 LaunchFuncOp::getGridSizeOperandValues() { |
| auto operands = getOperands().drop_front(getAsyncDependencies().size()); |
| return KernelDim3{operands[0], operands[1], operands[2]}; |
| } |
| |
| KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() { |
| auto operands = getOperands().drop_front(getAsyncDependencies().size()); |
| return KernelDim3{operands[3], operands[4], operands[5]}; |
| } |
| |
| KernelDim3 LaunchFuncOp::getClusterSizeOperandValues() { |
| assert(hasClusterSize() && |
| "cluster size is not set, check hasClusterSize() first"); |
| auto operands = getOperands().drop_front(getAsyncDependencies().size()); |
| return KernelDim3{operands[6], operands[7], operands[8]}; |
| } |
| |
| LogicalResult LaunchFuncOp::verify() { |
| auto module = (*this)->getParentOfType<ModuleOp>(); |
| if (!module) |
| return emitOpError("expected to belong to a module"); |
| |
| if (!module->getAttrOfType<UnitAttr>( |
| GPUDialect::getContainerModuleAttrName())) |
| return emitOpError("expected the closest surrounding module to have the '" + |
| GPUDialect::getContainerModuleAttrName() + |
| "' attribute"); |
| |
| if (hasClusterSize()) { |
| if (getClusterSizeY().getType() != getClusterSizeX().getType() || |
| getClusterSizeZ().getType() != getClusterSizeX().getType()) |
| return emitOpError() |
| << "expects types of the cluster dimensions must be the same"; |
| } |
| |
| return success(); |
| } |
| |
| static ParseResult |
| parseLaunchDimType(OpAsmParser &parser, Type &dimTy, |
| std::optional<OpAsmParser::UnresolvedOperand> clusterValue, |
| Type &clusterXTy, Type &clusterYTy, Type &clusterZTy) { |
| if (succeeded(parser.parseOptionalColon())) { |
| if (parser.parseType(dimTy)) |
| return failure(); |
| } else { |
| dimTy = IndexType::get(parser.getContext()); |
| } |
| if (clusterValue.has_value()) { |
| clusterXTy = clusterYTy = clusterZTy = dimTy; |
| } |
| return success(); |
| } |
| |
| static void printLaunchDimType(OpAsmPrinter &printer, Operation *op, Type dimTy, |
| Value clusterValue, Type clusterXTy, |
| Type clusterYTy, Type clusterZTy) { |
| if (!dimTy.isIndex()) |
| printer << ": " << dimTy; |
| } |
| |
| static ParseResult parseLaunchFuncOperands( |
| OpAsmParser &parser, |
| SmallVectorImpl<OpAsmParser::UnresolvedOperand> &argNames, |
| SmallVectorImpl<Type> &argTypes) { |
| if (parser.parseOptionalKeyword("args")) |
| return success(); |
| |
| auto parseElement = [&]() -> ParseResult { |
| return failure(parser.parseOperand(argNames.emplace_back()) || |
| parser.parseColonType(argTypes.emplace_back())); |
| }; |
| |
| return parser.parseCommaSeparatedList(OpAsmParser::Delimiter::Paren, |
| parseElement, " in argument list"); |
| } |
| |
| static void printLaunchFuncOperands(OpAsmPrinter &printer, Operation *, |
| OperandRange operands, TypeRange types) { |
| if (operands.empty()) |
| return; |
| printer << "args("; |
| llvm::interleaveComma(llvm::zip(operands, types), printer, |
| [&](const auto &pair) { |
| printer.printOperand(std::get<0>(pair)); |
| printer << " : "; |
| printer.printType(std::get<1>(pair)); |
| }); |
| printer << ")"; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // ShuffleOp |
| //===----------------------------------------------------------------------===// |
| |
| void ShuffleOp::build(OpBuilder &builder, OperationState &result, Value value, |
| int32_t offset, int32_t width, ShuffleMode mode) { |
| build(builder, result, value, |
| builder.create<arith::ConstantOp>(result.location, |
| builder.getI32IntegerAttr(offset)), |
| builder.create<arith::ConstantOp>(result.location, |
| builder.getI32IntegerAttr(width)), |
| mode); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // BarrierOp |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
| |
| /// Remove gpu.barrier after gpu.barrier, the threads are already synchronized! |
| LogicalResult eraseRedundantGpuBarrierOps(BarrierOp op, |
| PatternRewriter &rewriter) { |
| if (isa_and_nonnull<BarrierOp>(op->getNextNode())) { |
| rewriter.eraseOp(op); |
| return success(); |
| } |
| return failure(); |
| } |
| |
| } // end anonymous namespace |
| |
| void BarrierOp::getCanonicalizationPatterns(RewritePatternSet &results, |
| MLIRContext *context) { |
| results.add(eraseRedundantGpuBarrierOps); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPUFuncOp |
| //===----------------------------------------------------------------------===// |
| |
| /// Adds a new block argument that corresponds to buffers located in |
| /// workgroup memory. |
| BlockArgument GPUFuncOp::addWorkgroupAttribution(Type type, Location loc) { |
| auto attrName = getNumWorkgroupAttributionsAttrName(); |
| auto attr = (*this)->getAttrOfType<IntegerAttr>(attrName); |
| (*this)->setAttr(attrName, |
| IntegerAttr::get(attr.getType(), attr.getValue() + 1)); |
| return getBody().insertArgument( |
| getFunctionType().getNumInputs() + attr.getInt(), type, loc); |
| } |
| |
| /// Adds a new block argument that corresponds to buffers located in |
| /// private memory. |
| BlockArgument GPUFuncOp::addPrivateAttribution(Type type, Location loc) { |
| // Buffers on the private memory always come after buffers on the workgroup |
| // memory. |
| return getBody().addArgument(type, loc); |
| } |
| |
| void GPUFuncOp::build(OpBuilder &builder, OperationState &result, |
| StringRef name, FunctionType type, |
| TypeRange workgroupAttributions, |
| TypeRange privateAttributions, |
| ArrayRef<NamedAttribute> attrs) { |
| result.addAttribute(SymbolTable::getSymbolAttrName(), |
| builder.getStringAttr(name)); |
| result.addAttribute(getFunctionTypeAttrName(result.name), |
| TypeAttr::get(type)); |
| result.addAttribute(getNumWorkgroupAttributionsAttrName(), |
| builder.getI64IntegerAttr(workgroupAttributions.size())); |
| result.addAttributes(attrs); |
| Region *body = result.addRegion(); |
| Block *entryBlock = new Block; |
| |
| // TODO: Allow passing in proper locations here. |
| for (Type argTy : type.getInputs()) |
| entryBlock->addArgument(argTy, result.location); |
| for (Type argTy : workgroupAttributions) |
| entryBlock->addArgument(argTy, result.location); |
| for (Type argTy : privateAttributions) |
| entryBlock->addArgument(argTy, result.location); |
| |
| body->getBlocks().push_back(entryBlock); |
| } |
| |
| /// Parses a GPU function memory attribution. |
| /// |
| /// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)? |
| /// (`private` `(` ssa-id-and-type-list `)`)? |
| /// |
| /// Note that this function parses only one of the two similar parts, with the |
| /// keyword provided as argument. |
| static ParseResult |
| parseAttributions(OpAsmParser &parser, StringRef keyword, |
| SmallVectorImpl<OpAsmParser::Argument> &args, |
| Attribute &attributionAttrs) { |
| // If we could not parse the keyword, just assume empty list and succeed. |
| if (failed(parser.parseOptionalKeyword(keyword))) |
| return success(); |
| |
| size_t existingArgs = args.size(); |
| ParseResult result = |
| parser.parseArgumentList(args, OpAsmParser::Delimiter::Paren, |
| /*allowType=*/true, /*allowAttrs=*/true); |
| if (failed(result)) |
| return result; |
| |
| bool hadAttrs = llvm::any_of(ArrayRef(args).drop_front(existingArgs), |
| [](const OpAsmParser::Argument &arg) -> bool { |
| return arg.attrs && !arg.attrs.empty(); |
| }); |
| if (!hadAttrs) { |
| attributionAttrs = nullptr; |
| return result; |
| } |
| |
| Builder &builder = parser.getBuilder(); |
| SmallVector<Attribute> attributionAttrsVec; |
| for (const auto &argument : ArrayRef(args).drop_front(existingArgs)) { |
| if (!argument.attrs) |
| attributionAttrsVec.push_back(builder.getDictionaryAttr({})); |
| else |
| attributionAttrsVec.push_back(argument.attrs); |
| } |
| attributionAttrs = builder.getArrayAttr(attributionAttrsVec); |
| return result; |
| } |
| |
| /// Parses a GPU function. |
| /// |
| /// <operation> ::= `gpu.func` symbol-ref-id `(` argument-list `)` |
| /// (`->` function-result-list)? memory-attribution `kernel`? |
| /// function-attributes? region |
| ParseResult GPUFuncOp::parse(OpAsmParser &parser, OperationState &result) { |
| SmallVector<OpAsmParser::Argument> entryArgs; |
| SmallVector<DictionaryAttr> resultAttrs; |
| SmallVector<Type> resultTypes; |
| bool isVariadic; |
| |
| // Parse the function name. |
| StringAttr nameAttr; |
| if (parser.parseSymbolName(nameAttr, ::mlir::SymbolTable::getSymbolAttrName(), |
| result.attributes)) |
| return failure(); |
| |
| auto signatureLocation = parser.getCurrentLocation(); |
| if (failed(function_interface_impl::parseFunctionSignature( |
| parser, /*allowVariadic=*/false, entryArgs, isVariadic, resultTypes, |
| resultAttrs))) |
| return failure(); |
| |
| if (!entryArgs.empty() && entryArgs[0].ssaName.name.empty()) |
| return parser.emitError(signatureLocation) |
| << "gpu.func requires named arguments"; |
| |
| // Construct the function type. More types will be added to the region, but |
| // not to the function type. |
| Builder &builder = parser.getBuilder(); |
| |
| SmallVector<Type> argTypes; |
| for (auto &arg : entryArgs) |
| argTypes.push_back(arg.type); |
| auto type = builder.getFunctionType(argTypes, resultTypes); |
| result.addAttribute(getFunctionTypeAttrName(result.name), |
| TypeAttr::get(type)); |
| |
| function_interface_impl::addArgAndResultAttrs( |
| builder, result, entryArgs, resultAttrs, getArgAttrsAttrName(result.name), |
| getResAttrsAttrName(result.name)); |
| |
| Attribute workgroupAttributionAttrs; |
| // Parse workgroup memory attributions. |
| if (failed(parseAttributions(parser, GPUFuncOp::getWorkgroupKeyword(), |
| entryArgs, workgroupAttributionAttrs))) |
| return failure(); |
| |
| // Store the number of operands we just parsed as the number of workgroup |
| // memory attributions. |
| unsigned numWorkgroupAttrs = entryArgs.size() - type.getNumInputs(); |
| result.addAttribute(GPUFuncOp::getNumWorkgroupAttributionsAttrName(), |
| builder.getI64IntegerAttr(numWorkgroupAttrs)); |
| if (workgroupAttributionAttrs) |
| result.addAttribute(GPUFuncOp::getWorkgroupAttribAttrsAttrName(result.name), |
| workgroupAttributionAttrs); |
| |
| Attribute privateAttributionAttrs; |
| // Parse private memory attributions. |
| if (failed(parseAttributions(parser, GPUFuncOp::getPrivateKeyword(), |
| entryArgs, privateAttributionAttrs))) |
| return failure(); |
| if (privateAttributionAttrs) |
| result.addAttribute(GPUFuncOp::getPrivateAttribAttrsAttrName(result.name), |
| privateAttributionAttrs); |
| |
| // Parse the kernel attribute if present. |
| if (succeeded(parser.parseOptionalKeyword(GPUFuncOp::getKernelKeyword()))) |
| result.addAttribute(GPUDialect::getKernelFuncAttrName(), |
| builder.getUnitAttr()); |
| |
| // Parse attributes. |
| if (failed(parser.parseOptionalAttrDictWithKeyword(result.attributes))) |
| return failure(); |
| |
| // Parse the region. If no argument names were provided, take all names |
| // (including those of attributions) from the entry block. |
| auto *body = result.addRegion(); |
| return parser.parseRegion(*body, entryArgs); |
| } |
| |
| static void printAttributions(OpAsmPrinter &p, StringRef keyword, |
| ArrayRef<BlockArgument> values, |
| ArrayAttr attributes) { |
| if (values.empty()) |
| return; |
| |
| p << ' ' << keyword << '('; |
| llvm::interleaveComma( |
| llvm::enumerate(values), p, [&p, attributes](auto pair) { |
| BlockArgument v = pair.value(); |
| p << v << " : " << v.getType(); |
| |
| size_t attributionIndex = pair.index(); |
| DictionaryAttr attrs; |
| if (attributes && attributionIndex < attributes.size()) |
| attrs = llvm::cast<DictionaryAttr>(attributes[attributionIndex]); |
| if (attrs) |
| p.printOptionalAttrDict(attrs.getValue()); |
| }); |
| p << ')'; |
| } |
| |
| void GPUFuncOp::print(OpAsmPrinter &p) { |
| p << ' '; |
| p.printSymbolName(getName()); |
| |
| FunctionType type = getFunctionType(); |
| function_interface_impl::printFunctionSignature(p, *this, type.getInputs(), |
| /*isVariadic=*/false, |
| type.getResults()); |
| |
| printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions(), |
| getWorkgroupAttribAttrs().value_or(nullptr)); |
| printAttributions(p, getPrivateKeyword(), getPrivateAttributions(), |
| getPrivateAttribAttrs().value_or(nullptr)); |
| if (isKernel()) |
| p << ' ' << getKernelKeyword(); |
| |
| function_interface_impl::printFunctionAttributes( |
| p, *this, |
| {getNumWorkgroupAttributionsAttrName(), |
| GPUDialect::getKernelFuncAttrName(), getFunctionTypeAttrName(), |
| getArgAttrsAttrName(), getResAttrsAttrName(), |
| getWorkgroupAttribAttrsAttrName(), getPrivateAttribAttrsAttrName()}); |
| p << ' '; |
| p.printRegion(getBody(), /*printEntryBlockArgs=*/false); |
| } |
| |
| static DictionaryAttr getAttributionAttrs(GPUFuncOp op, unsigned index, |
| StringAttr attrName) { |
| auto allAttrs = llvm::dyn_cast_or_null<ArrayAttr>(op->getAttr(attrName)); |
| if (!allAttrs || index >= allAttrs.size()) |
| return DictionaryAttr(); |
| return llvm::cast<DictionaryAttr>(allAttrs[index]); |
| } |
| |
| DictionaryAttr GPUFuncOp::getworkgroupAttributionAttrs(unsigned index) { |
| return getAttributionAttrs(*this, index, getWorkgroupAttribAttrsAttrName()); |
| } |
| |
| DictionaryAttr GPUFuncOp::getPrivateAttributionAttrs(unsigned index) { |
| return getAttributionAttrs(*this, index, getPrivateAttribAttrsAttrName()); |
| } |
| |
| static void setAttributionAttrs(GPUFuncOp op, unsigned index, |
| DictionaryAttr value, StringAttr attrName) { |
| MLIRContext *ctx = op.getContext(); |
| auto allAttrs = llvm::dyn_cast_or_null<ArrayAttr>(op->getAttr(attrName)); |
| SmallVector<Attribute> elements; |
| if (allAttrs) |
| elements.append(allAttrs.begin(), allAttrs.end()); |
| while (elements.size() <= index) |
| elements.push_back(DictionaryAttr::get(ctx)); |
| if (!value) |
| elements[index] = DictionaryAttr::get(ctx); |
| else |
| elements[index] = value; |
| ArrayAttr newValue = ArrayAttr::get(ctx, elements); |
| op->setAttr(attrName, newValue); |
| } |
| |
| void GPUFuncOp::setworkgroupAttributionAttrs(unsigned index, |
| DictionaryAttr value) { |
| setAttributionAttrs(*this, index, value, getWorkgroupAttribAttrsAttrName()); |
| } |
| |
| void GPUFuncOp::setPrivateAttributionAttrs(unsigned int index, |
| DictionaryAttr value) { |
| setAttributionAttrs(*this, index, value, getPrivateAttribAttrsAttrName()); |
| } |
| |
| static Attribute getAttributionAttr(GPUFuncOp op, unsigned index, |
| StringAttr name, StringAttr attrsName) { |
| DictionaryAttr dict = getAttributionAttrs(op, index, attrsName); |
| if (!dict) |
| return Attribute(); |
| return dict.get(name); |
| } |
| |
| Attribute GPUFuncOp::getWorkgroupAttributionAttr(unsigned index, |
| StringAttr name) { |
| assert(index < getNumWorkgroupAttributions() && |
| "index must map to a workgroup attribution"); |
| return getAttributionAttr(*this, index, name, |
| getWorkgroupAttribAttrsAttrName()); |
| } |
| |
| Attribute GPUFuncOp::getPrivateAttributionAttr(unsigned index, |
| StringAttr name) { |
| assert(index < getNumPrivateAttributions() && |
| "index must map to a private attribution"); |
| return getAttributionAttr(*this, index, name, |
| getPrivateAttribAttrsAttrName()); |
| } |
| |
| static void setAttributionAttr(GPUFuncOp op, unsigned index, StringAttr name, |
| Attribute value, StringAttr attrsName) { |
| MLIRContext *ctx = op.getContext(); |
| SmallVector<NamedAttribute> elems; |
| DictionaryAttr oldDict = getAttributionAttrs(op, index, attrsName); |
| if (oldDict) |
| elems.append(oldDict.getValue().begin(), oldDict.getValue().end()); |
| |
| bool found = false; |
| bool mustSort = true; |
| for (unsigned i = 0, e = elems.size(); i < e; ++i) { |
| if (elems[i].getName() == name) { |
| found = true; |
| if (!value) { |
| std::swap(elems[i], elems[elems.size() - 1]); |
| elems.pop_back(); |
| } else { |
| mustSort = false; |
| elems[i] = NamedAttribute(elems[i].getName(), value); |
| } |
| break; |
| } |
| } |
| if (!found) { |
| if (!value) |
| return; |
| elems.emplace_back(name, value); |
| } |
| if (mustSort) { |
| DictionaryAttr::sortInPlace(elems); |
| } |
| auto newDict = DictionaryAttr::getWithSorted(ctx, elems); |
| setAttributionAttrs(op, index, newDict, attrsName); |
| } |
| |
| void GPUFuncOp::setWorkgroupAttributionAttr(unsigned index, StringAttr name, |
| Attribute value) { |
| assert(index < getNumWorkgroupAttributions() && |
| "index must map to a workgroup attribution"); |
| setAttributionAttr(*this, index, name, value, |
| getWorkgroupAttribAttrsAttrName()); |
| } |
| |
| void GPUFuncOp::setPrivateAttributionAttr(unsigned index, StringAttr name, |
| Attribute value) { |
| assert(index < getNumPrivateAttributions() && |
| "index must map to a private attribution"); |
| setAttributionAttr(*this, index, name, value, |
| getPrivateAttribAttrsAttrName()); |
| } |
| |
| LogicalResult GPUFuncOp::verifyType() { |
| if (isKernel() && getFunctionType().getNumResults() != 0) |
| return emitOpError() << "expected void return type for kernel function"; |
| |
| return success(); |
| } |
| |
| /// Verifies the body of the function. |
| LogicalResult GPUFuncOp::verifyBody() { |
| if (empty()) |
| return emitOpError() << "expected body with at least one block"; |
| unsigned numFuncArguments = getNumArguments(); |
| unsigned numWorkgroupAttributions = getNumWorkgroupAttributions(); |
| unsigned numBlockArguments = front().getNumArguments(); |
| if (numBlockArguments < numFuncArguments + numWorkgroupAttributions) |
| return emitOpError() << "expected at least " |
| << numFuncArguments + numWorkgroupAttributions |
| << " arguments to body region"; |
| |
| ArrayRef<Type> funcArgTypes = getFunctionType().getInputs(); |
| for (unsigned i = 0; i < numFuncArguments; ++i) { |
| Type blockArgType = front().getArgument(i).getType(); |
| if (funcArgTypes[i] != blockArgType) |
| return emitOpError() << "expected body region argument #" << i |
| << " to be of type " << funcArgTypes[i] << ", got " |
| << blockArgType; |
| } |
| |
| if (failed(verifyAttributions(getOperation(), getWorkgroupAttributions(), |
| GPUDialect::getWorkgroupAddressSpace())) || |
| failed(verifyAttributions(getOperation(), getPrivateAttributions(), |
| GPUDialect::getPrivateAddressSpace()))) |
| return failure(); |
| |
| return success(); |
| } |
| |
| static LogicalResult verifyKnownLaunchSizeAttr(gpu::GPUFuncOp op, |
| StringRef attrName) { |
| auto maybeAttr = op->getAttr(attrName); |
| if (!maybeAttr) |
| return success(); |
| auto array = llvm::dyn_cast<DenseI32ArrayAttr>(maybeAttr); |
| if (!array) |
| return op.emitOpError(attrName + " must be a dense i32 array"); |
| if (array.size() != 3) |
| return op.emitOpError(attrName + " must contain exactly 3 elements"); |
| return success(); |
| } |
| |
| LogicalResult GPUFuncOp::verify() { |
| if (failed(verifyKnownLaunchSizeAttr(*this, getKnownBlockSizeAttrName()))) |
| return failure(); |
| if (failed(verifyKnownLaunchSizeAttr(*this, getKnownGridSizeAttrName()))) |
| return failure(); |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // ReturnOp |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult gpu::ReturnOp::verify() { |
| GPUFuncOp function = (*this)->getParentOfType<GPUFuncOp>(); |
| |
| FunctionType funType = function.getFunctionType(); |
| |
| if (funType.getNumResults() != getOperands().size()) |
| return emitOpError() |
| .append("expected ", funType.getNumResults(), " result operands") |
| .attachNote(function.getLoc()) |
| .append("return type declared here"); |
| |
| for (const auto &pair : llvm::enumerate( |
| llvm::zip(function.getFunctionType().getResults(), getOperands()))) { |
| auto [type, operand] = pair.value(); |
| if (type != operand.getType()) |
| return emitOpError() << "unexpected type `" << operand.getType() |
| << "' for operand #" << pair.index(); |
| } |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPUModuleOp |
| //===----------------------------------------------------------------------===// |
| |
| void GPUModuleOp::build(OpBuilder &builder, OperationState &result, |
| StringRef name, ArrayAttr targets) { |
| ensureTerminator(*result.addRegion(), builder, result.location); |
| result.attributes.push_back(builder.getNamedAttr( |
| ::mlir::SymbolTable::getSymbolAttrName(), builder.getStringAttr(name))); |
| |
| if (targets) |
| result.getOrAddProperties<Properties>().targets = targets; |
| } |
| |
| void GPUModuleOp::build(OpBuilder &builder, OperationState &result, |
| StringRef name, ArrayRef<Attribute> targets) { |
| build(builder, result, name, |
| targets.empty() ? ArrayAttr() : builder.getArrayAttr(targets)); |
| } |
| |
| ParseResult GPUModuleOp::parse(OpAsmParser &parser, OperationState &result) { |
| StringAttr nameAttr; |
| ArrayAttr targetsAttr; |
| |
| if (parser.parseSymbolName(nameAttr, mlir::SymbolTable::getSymbolAttrName(), |
| result.attributes)) |
| return failure(); |
| |
| // Parse the optional array of target attributes. |
| OptionalParseResult targetsAttrResult = |
| parser.parseOptionalAttribute(targetsAttr, Type{}); |
| if (targetsAttrResult.has_value()) { |
| if (failed(*targetsAttrResult)) { |
| return failure(); |
| } |
| result.getOrAddProperties<Properties>().targets = targetsAttr; |
| } |
| |
| // If module attributes are present, parse them. |
| if (parser.parseOptionalAttrDictWithKeyword(result.attributes)) |
| return failure(); |
| |
| // Parse the module body. |
| auto *body = result.addRegion(); |
| if (parser.parseRegion(*body, {})) |
| return failure(); |
| |
| // Ensure that this module has a valid terminator. |
| GPUModuleOp::ensureTerminator(*body, parser.getBuilder(), result.location); |
| return success(); |
| } |
| |
| void GPUModuleOp::print(OpAsmPrinter &p) { |
| p << ' '; |
| p.printSymbolName(getName()); |
| |
| if (Attribute attr = getTargetsAttr()) { |
| p << ' '; |
| p.printAttribute(attr); |
| p << ' '; |
| } |
| |
| p.printOptionalAttrDictWithKeyword( |
| (*this)->getAttrs(), |
| {mlir::SymbolTable::getSymbolAttrName(), getTargetsAttrName()}); |
| p << ' '; |
| p.printRegion(getRegion(), /*printEntryBlockArgs=*/false, |
| /*printBlockTerminators=*/false); |
| } |
| |
| bool GPUModuleOp::hasTarget(Attribute target) { |
| if (ArrayAttr targets = getTargetsAttr()) |
| return llvm::count(targets.getValue(), target); |
| return false; |
| } |
| |
| void GPUModuleOp::setTargets(ArrayRef<TargetAttrInterface> targets) { |
| ArrayAttr &targetsAttr = getProperties().targets; |
| SmallVector<Attribute> targetsVector(targets); |
| targetsAttr = ArrayAttr::get(getContext(), targetsVector); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPUBinaryOp |
| //===----------------------------------------------------------------------===// |
| void BinaryOp::build(OpBuilder &builder, OperationState &result, StringRef name, |
| Attribute offloadingHandler, ArrayAttr objects) { |
| auto &properties = result.getOrAddProperties<Properties>(); |
| result.attributes.push_back(builder.getNamedAttr( |
| SymbolTable::getSymbolAttrName(), builder.getStringAttr(name))); |
| properties.objects = objects; |
| if (offloadingHandler) |
| properties.offloadingHandler = offloadingHandler; |
| else |
| properties.offloadingHandler = builder.getAttr<SelectObjectAttr>(nullptr); |
| } |
| |
| void BinaryOp::build(OpBuilder &builder, OperationState &result, StringRef name, |
| Attribute offloadingHandler, ArrayRef<Attribute> objects) { |
| build(builder, result, name, offloadingHandler, |
| objects.empty() ? ArrayAttr() : builder.getArrayAttr(objects)); |
| } |
| |
| static ParseResult parseOffloadingHandler(OpAsmParser &parser, |
| Attribute &offloadingHandler) { |
| if (succeeded(parser.parseOptionalLess())) { |
| if (parser.parseAttribute(offloadingHandler)) |
| return failure(); |
| if (parser.parseGreater()) |
| return failure(); |
| } |
| if (!offloadingHandler) |
| offloadingHandler = parser.getBuilder().getAttr<SelectObjectAttr>(nullptr); |
| return success(); |
| } |
| |
| static void printOffloadingHandler(OpAsmPrinter &printer, Operation *op, |
| Attribute offloadingHandler) { |
| if (offloadingHandler != SelectObjectAttr::get(op->getContext(), nullptr)) |
| printer << '<' << offloadingHandler << '>'; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPUMemcpyOp |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult MemcpyOp::verify() { |
| auto srcType = getSrc().getType(); |
| auto dstType = getDst().getType(); |
| |
| if (getElementTypeOrSelf(srcType) != getElementTypeOrSelf(dstType)) |
| return emitOpError("arguments have incompatible element type"); |
| |
| if (failed(verifyCompatibleShape(srcType, dstType))) |
| return emitOpError("arguments have incompatible shape"); |
| |
| return success(); |
| } |
| |
| namespace { |
| |
| /// Erases a common case of copy ops where a destination value is used only by |
| /// the copy op, alloc and dealloc ops. |
| struct EraseTrivialCopyOp : public OpRewritePattern<MemcpyOp> { |
| using OpRewritePattern<MemcpyOp>::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(MemcpyOp op, |
| PatternRewriter &rewriter) const override { |
| Value dest = op.getDst(); |
| Operation *destDefOp = dest.getDefiningOp(); |
| // `dest` must be defined by an op having Allocate memory effect in order to |
| // perform the folding. |
| if (!destDefOp || |
| !hasSingleEffect<MemoryEffects::Allocate>(destDefOp, dest)) |
| return failure(); |
| // We can erase `op` iff `dest` has no other use apart from its |
| // use by `op` and dealloc ops. |
| if (llvm::any_of(dest.getUsers(), [op, dest](Operation *user) { |
| return user != op && |
| !hasSingleEffect<MemoryEffects::Free>(user, dest); |
| })) |
| return failure(); |
| // We can perform the folding if and only if op has a single async |
| // dependency and produces an async token as result, or if it does not have |
| // any async dependency and does not produce any async token result. |
| if (op.getAsyncDependencies().size() > 1 || |
| ((op.getAsyncDependencies().empty() && op.getAsyncToken()) || |
| (!op.getAsyncDependencies().empty() && !op.getAsyncToken()))) |
| return failure(); |
| rewriter.replaceOp(op, op.getAsyncDependencies()); |
| return success(); |
| } |
| }; |
| |
| } // end anonymous namespace |
| |
| void MemcpyOp::getCanonicalizationPatterns(RewritePatternSet &results, |
| MLIRContext *context) { |
| results.add<EraseTrivialCopyOp>(context); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPU_SubgroupMmaLoadMatrixOp |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult SubgroupMmaLoadMatrixOp::verify() { |
| auto srcType = getSrcMemref().getType(); |
| auto resType = getRes().getType(); |
| auto resMatrixType = llvm::cast<gpu::MMAMatrixType>(resType); |
| auto operand = resMatrixType.getOperand(); |
| auto srcMemrefType = llvm::cast<MemRefType>(srcType); |
| |
| if (!isLastMemrefDimUnitStride(srcMemrefType)) |
| return emitError( |
| "expected source memref most minor dim must have unit stride"); |
| |
| if (!operand.equals("AOp") && !operand.equals("BOp") && |
| !operand.equals("COp")) |
| return emitError("only AOp, BOp and COp can be loaded"); |
| |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPU_SubgroupMmaStoreMatrixOp |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult SubgroupMmaStoreMatrixOp::verify() { |
| auto srcType = getSrc().getType(); |
| auto dstType = getDstMemref().getType(); |
| auto srcMatrixType = llvm::cast<gpu::MMAMatrixType>(srcType); |
| auto dstMemrefType = llvm::cast<MemRefType>(dstType); |
| |
| if (!isLastMemrefDimUnitStride(dstMemrefType)) |
| return emitError( |
| "expected destination memref most minor dim must have unit stride"); |
| |
| if (!srcMatrixType.getOperand().equals("COp")) |
| return emitError( |
| "expected the operand matrix being stored to have 'COp' operand type"); |
| |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPU_SubgroupMmaComputeOp |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult SubgroupMmaComputeOp::verify() { |
| enum OperandMap { A, B, C }; |
| SmallVector<MMAMatrixType, 3> opTypes; |
| opTypes.push_back(llvm::cast<MMAMatrixType>(getOpA().getType())); |
| opTypes.push_back(llvm::cast<MMAMatrixType>(getOpB().getType())); |
| opTypes.push_back(llvm::cast<MMAMatrixType>(getOpC().getType())); |
| |
| if (!opTypes[A].getOperand().equals("AOp") || |
| !opTypes[B].getOperand().equals("BOp") || |
| !opTypes[C].getOperand().equals("COp")) |
| return emitError("operands must be in the order AOp, BOp, COp"); |
| |
| ArrayRef<int64_t> aShape, bShape, cShape; |
| aShape = opTypes[A].getShape(); |
| bShape = opTypes[B].getShape(); |
| cShape = opTypes[C].getShape(); |
| |
| if (aShape[1] != bShape[0] || aShape[0] != cShape[0] || |
| bShape[1] != cShape[1]) |
| return emitError("operand shapes do not satisfy matmul constraints"); |
| |
| return success(); |
| } |
| |
| LogicalResult MemcpyOp::fold(FoldAdaptor adaptor, |
| SmallVectorImpl<::mlir::OpFoldResult> &results) { |
| return memref::foldMemRefCast(*this); |
| } |
| |
| LogicalResult MemsetOp::fold(FoldAdaptor adaptor, |
| SmallVectorImpl<::mlir::OpFoldResult> &results) { |
| return memref::foldMemRefCast(*this); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPU_WaitOp |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
| |
| /// Remove gpu.wait op use of gpu.wait op def without async dependencies. |
| /// %t = gpu.wait async [] // No async dependencies. |
| /// ... gpu.wait ... [%t, ...] // %t can be removed. |
| struct EraseRedundantGpuWaitOpPairs : public OpRewritePattern<WaitOp> { |
| public: |
| using OpRewritePattern::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(WaitOp op, |
| PatternRewriter &rewriter) const final { |
| auto predicate = [](Value value) { |
| auto waitOp = value.getDefiningOp<WaitOp>(); |
| return waitOp && waitOp->getNumOperands() == 0; |
| }; |
| if (llvm::none_of(op.getAsyncDependencies(), predicate)) |
| return failure(); |
| SmallVector<Value> validOperands; |
| for (Value operand : op->getOperands()) { |
| if (predicate(operand)) |
| continue; |
| validOperands.push_back(operand); |
| } |
| rewriter.updateRootInPlace(op, [&]() { op->setOperands(validOperands); }); |
| return success(); |
| } |
| }; |
| |
| /// Simplify trivial gpu.wait ops for the following patterns. |
| /// 1. %t = gpu.wait async ... ops, where %t has no uses (regardless of async |
| /// dependencies). |
| /// 2. %t1 = gpu.wait async [%t0], in this case, we can replace uses of %t1 with |
| /// %t0. |
| /// 3. gpu.wait [] ops, i.e gpu.wait ops that neither have any async |
| /// dependencies nor return any token. |
| struct SimplifyGpuWaitOp : public OpRewritePattern<WaitOp> { |
| public: |
| using OpRewritePattern::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(WaitOp op, |
| PatternRewriter &rewriter) const final { |
| // Erase gpu.wait ops that neither have any async dependencies nor return |
| // any async token. |
| if (op.getAsyncDependencies().empty() && !op.getAsyncToken()) { |
| rewriter.eraseOp(op); |
| return success(); |
| } |
| // Replace uses of %t1 = gpu.wait async [%t0] ops with %t0 and erase the op. |
| if (llvm::hasSingleElement(op.getAsyncDependencies()) && |
| op.getAsyncToken()) { |
| rewriter.replaceOp(op, op.getAsyncDependencies()); |
| return success(); |
| } |
| // Erase %t = gpu.wait async ... ops, where %t has no uses. |
| if (op.getAsyncToken() && op.getAsyncToken().use_empty()) { |
| rewriter.eraseOp(op); |
| return success(); |
| } |
| return failure(); |
| } |
| }; |
| |
| } // end anonymous namespace |
| |
| void WaitOp::getCanonicalizationPatterns(RewritePatternSet &results, |
| MLIRContext *context) { |
| results.add<EraseRedundantGpuWaitOpPairs, SimplifyGpuWaitOp>(context); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPU_AllocOp |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult AllocOp::verify() { |
| auto memRefType = llvm::cast<MemRefType>(getMemref().getType()); |
| |
| if (static_cast<int64_t>(getDynamicSizes().size()) != |
| memRefType.getNumDynamicDims()) |
| return emitOpError("dimension operand count does not equal memref " |
| "dynamic dimension count"); |
| |
| unsigned numSymbols = 0; |
| if (!memRefType.getLayout().isIdentity()) |
| numSymbols = memRefType.getLayout().getAffineMap().getNumSymbols(); |
| if (getSymbolOperands().size() != numSymbols) { |
| return emitOpError( |
| "symbol operand count does not equal memref symbol count"); |
| } |
| |
| return success(); |
| } |
| |
| namespace { |
| |
| /// Folding of memref.dim(gpu.alloc(%size), %idx) -> %size similar to |
| /// `memref::AllocOp`. |
| struct SimplifyDimOfAllocOp : public OpRewritePattern<memref::DimOp> { |
| using OpRewritePattern<memref::DimOp>::OpRewritePattern; |
| |
| LogicalResult matchAndRewrite(memref::DimOp dimOp, |
| PatternRewriter &rewriter) const override { |
| std::optional<int64_t> index = dimOp.getConstantIndex(); |
| if (!index) |
| return failure(); |
| |
| auto memrefType = llvm::dyn_cast<MemRefType>(dimOp.getSource().getType()); |
| if (!memrefType || !memrefType.isDynamicDim(index.value())) |
| return failure(); |
| |
| auto alloc = dimOp.getSource().getDefiningOp<AllocOp>(); |
| if (!alloc) |
| return failure(); |
| |
| Value substituteOp = *(alloc.getDynamicSizes().begin() + |
| memrefType.getDynamicDimIndex(index.value())); |
| rewriter.replaceOp(dimOp, substituteOp); |
| return success(); |
| } |
| }; |
| |
| } // namespace |
| |
| void AllocOp::getCanonicalizationPatterns(RewritePatternSet &results, |
| MLIRContext *context) { |
| results.add<SimplifyDimOfAllocOp>(context); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPU object attribute |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError, |
| Attribute target, CompilationTarget format, |
| StringAttr object, DictionaryAttr properties) { |
| if (!target) |
| return emitError() << "the target attribute cannot be null"; |
| if (target.hasPromiseOrImplementsInterface<TargetAttrInterface>()) |
| return success(); |
| return emitError() << "the target attribute must implement or promise the " |
| "`gpu::TargetAttrInterface`"; |
| } |
| |
| namespace { |
| LogicalResult parseObject(AsmParser &odsParser, CompilationTarget &format, |
| StringAttr &object) { |
| std::optional<CompilationTarget> formatResult; |
| StringRef enumKeyword; |
| auto loc = odsParser.getCurrentLocation(); |
| if (failed(odsParser.parseOptionalKeyword(&enumKeyword))) |
| formatResult = CompilationTarget::Fatbin; |
| if (!formatResult && |
| (formatResult = |
| gpu::symbolizeEnum<gpu::CompilationTarget>(enumKeyword)) && |
| odsParser.parseEqual()) |
| return odsParser.emitError(loc, "expected an equal sign"); |
| if (!formatResult) |
| return odsParser.emitError(loc, "expected keyword for GPU object format"); |
| FailureOr<StringAttr> objectResult = |
| FieldParser<StringAttr>::parse(odsParser); |
| if (failed(objectResult)) |
| return odsParser.emitError(odsParser.getCurrentLocation(), |
| "failed to parse GPU_ObjectAttr parameter " |
| "'object' which is to be a `StringAttr`"); |
| format = *formatResult; |
| object = *objectResult; |
| return success(); |
| } |
| |
| void printObject(AsmPrinter &odsParser, CompilationTarget format, |
| StringAttr object) { |
| if (format != CompilationTarget::Fatbin) |
| odsParser << stringifyEnum(format) << " = "; |
| odsParser << object; |
| } |
| } // namespace |
| |
| //===----------------------------------------------------------------------===// |
| // GPU select object attribute |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult |
| gpu::SelectObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError, |
| Attribute target) { |
| // Check `target`, it can be null, an integer attr or a GPU Target attribute. |
| if (target) { |
| if (auto intAttr = mlir::dyn_cast<IntegerAttr>(target)) { |
| if (intAttr.getInt() < 0) { |
| return emitError() << "the object index must be positive"; |
| } |
| } else if (!target.hasPromiseOrImplementsInterface<TargetAttrInterface>()) { |
| return emitError() |
| << "the target attribute must be a GPU Target attribute"; |
| } |
| } |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // DynamicSharedMemoryOp |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult gpu::DynamicSharedMemoryOp::verify() { |
| if (!getOperation()->getParentWithTrait<OpTrait::SymbolTable>()) |
| return emitOpError() << "must be inside an op with symbol table"; |
| |
| MemRefType memrefType = getResultMemref().getType(); |
| // Check address space |
| if (!GPUDialect::hasWorkgroupMemoryAddressSpace(memrefType)) { |
| return emitOpError() << "address space must be " |
| << gpu::AddressSpaceAttr::getMnemonic() << "<" |
| << stringifyEnum(gpu::AddressSpace::Workgroup) << ">"; |
| } |
| if (memrefType.hasStaticShape()) { |
| return emitOpError() << "result memref type must be memref<?xi8, " |
| "#gpu.address_space<workgroup>>"; |
| } |
| return success(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPU target options |
| //===----------------------------------------------------------------------===// |
| |
| TargetOptions::TargetOptions( |
| StringRef toolkitPath, ArrayRef<std::string> linkFiles, |
| StringRef cmdOptions, CompilationTarget compilationTarget, |
| function_ref<SymbolTable *()> getSymbolTableCallback) |
| : TargetOptions(TypeID::get<TargetOptions>(), toolkitPath, linkFiles, |
| cmdOptions, compilationTarget, getSymbolTableCallback) {} |
| |
| TargetOptions::TargetOptions( |
| TypeID typeID, StringRef toolkitPath, ArrayRef<std::string> linkFiles, |
| StringRef cmdOptions, CompilationTarget compilationTarget, |
| function_ref<SymbolTable *()> getSymbolTableCallback) |
| : toolkitPath(toolkitPath.str()), linkFiles(linkFiles), |
| cmdOptions(cmdOptions.str()), compilationTarget(compilationTarget), |
| getSymbolTableCallback(getSymbolTableCallback), typeID(typeID) {} |
| |
| TypeID TargetOptions::getTypeID() const { return typeID; } |
| |
| StringRef TargetOptions::getToolkitPath() const { return toolkitPath; } |
| |
| ArrayRef<std::string> TargetOptions::getLinkFiles() const { return linkFiles; } |
| |
| StringRef TargetOptions::getCmdOptions() const { return cmdOptions; } |
| |
| SymbolTable *TargetOptions::getSymbolTable() const { |
| return getSymbolTableCallback ? getSymbolTableCallback() : nullptr; |
| } |
| |
| CompilationTarget TargetOptions::getCompilationTarget() const { |
| return compilationTarget; |
| } |
| |
| CompilationTarget TargetOptions::getDefaultCompilationTarget() { |
| return CompilationTarget::Fatbin; |
| } |
| |
| std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> |
| TargetOptions::tokenizeCmdOptions() const { |
| std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> options; |
| llvm::StringSaver stringSaver(options.first); |
| StringRef opts = cmdOptions; |
| // For a correct tokenization of the command line options `opts` must be |
| // unquoted, otherwise the tokenization function returns a single string: the |
| // unquoted `cmdOptions` -which is not the desired behavior. |
| // Remove any quotes if they are at the beginning and end of the string: |
| if (!opts.empty() && opts.front() == '"' && opts.back() == '"') |
| opts.consume_front("\""), opts.consume_back("\""); |
| if (!opts.empty() && opts.front() == '\'' && opts.back() == '\'') |
| opts.consume_front("'"), opts.consume_back("'"); |
| #ifdef _WIN32 |
| llvm::cl::TokenizeWindowsCommandLine(opts, stringSaver, options.second, |
| /*MarkEOLs=*/false); |
| #else |
| llvm::cl::TokenizeGNUCommandLine(opts, stringSaver, options.second, |
| /*MarkEOLs=*/false); |
| #endif // _WIN32 |
| return options; |
| } |
| |
| MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions) |
| |
| #include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc" |
| #include "mlir/Dialect/GPU/IR/GPUOpsEnums.cpp.inc" |
| |
| #define GET_ATTRDEF_CLASSES |
| #include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc" |
| |
| #define GET_OP_CLASSES |
| #include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc" |
| |
| #include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.cpp.inc" |