//===- LoopsToGPUPass.cpp - Convert a loop nest to a GPU kernel -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h"
#include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Pass/Pass.h"

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/CommandLine.h"

#define PASS_NAME "convert-loops-to-gpu"
#define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu"

using namespace mlir;
using namespace mlir::loop;

static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options");
static llvm::cl::opt<unsigned>
    clNumBlockDims("gpu-block-dims",
                   llvm::cl::desc("Number of GPU block dimensions for mapping"),
                   llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));
static llvm::cl::opt<unsigned> clNumThreadDims(
    "gpu-thread-dims",
    llvm::cl::desc("Number of GPU thread dimensions for mapping"),
    llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));
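
// Illustrative mlir-opt invocation exercising the two flags above (the
// dimension counts here are only an example, not the defaults):
//   mlir-opt -convert-loops-to-gpu -gpu-block-dims=2 -gpu-thread-dims=2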

static llvm::cl::OptionCategory clLoopOpToGPUCategory(LOOPOP_TO_GPU_PASS_NAME
                                                      " options");
static llvm::cl::list<unsigned>
    clNumWorkGroups("gpu-num-workgroups",
                    llvm::cl::desc("Number of workgroups in the GPU launch"),
                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
                    llvm::cl::cat(clLoopOpToGPUCategory));
static llvm::cl::list<unsigned>
    clWorkGroupSize("gpu-workgroup-size",
                    llvm::cl::desc("Workgroup size in the GPU launch"),
                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
                    llvm::cl::cat(clLoopOpToGPUCategory));
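
// Illustrative mlir-opt invocation for this pass (the workgroup counts and
// sizes are only an example):
//   mlir-opt -convert-loop-op-to-gpu \
//            -gpu-num-workgroups=4,2 -gpu-workgroup-size=32,4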

namespace {
// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. Nested launches are not allowed, so this does not
// walk the function recursively to avoid considering nested loops.
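//
// For intuition, with one block dimension and one thread dimension a nest
// such as
//
//   affine.for %i = 0 to %n {
//     affine.for %j = 0 to %m {
//       ...
//     }
//   }
//
// is rewritten into a single gpu.launch whose grid spans the %i range and
// whose block spans the %j range, with the original induction variables
// recomputed inside the launch body from the block and thread ids. (This is
// only a sketch; the exact IR produced by the conversion may differ.)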
struct ForLoopMapper : public FunctionPass<ForLoopMapper> {
  ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims)
      : numBlockDims(numBlockDims), numThreadDims(numThreadDims) {}

  void runOnFunction() override {
    for (Block &block : getFunction())
      for (Operation &op : llvm::make_early_inc_range(block)) {
        if (auto forOp = dyn_cast<AffineForOp>(&op)) {
          if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
                                                      numThreadDims)))
            signalPassFailure();
        } else if (auto forOp = dyn_cast<ForOp>(&op)) {
          if (failed(convertLoopNestToGPULaunch(forOp, numBlockDims,
                                                numThreadDims)))
            signalPassFailure();
        }
      }
  }

  unsigned numBlockDims;
  unsigned numThreadDims;
};

// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. The top-level loop nest itself does not have to be
// perfectly nested. The only requirement is that there be as many perfectly
// nested loops as the size of `numWorkGroups`. Within these, any loop nest
// has to be perfectly nested up to a depth equal to the size of
// `workGroupSize`.
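//
// For example (purely illustrative), with a `numWorkGroups` of size 2 the two
// outermost perfectly nested loops provide the workgroup grid, and with a
// `workGroupSize` of size 2 every loop nest contained in them must be
// perfectly nested to depth 2 so that it can be mapped onto workgroup-local
// threads.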
struct ImperfectlyNestedForLoopMapper
    : public FunctionPass<ImperfectlyNestedForLoopMapper> {
  ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups,
                                 ArrayRef<int64_t> workGroupSize)
      : numWorkGroups(numWorkGroups.begin(), numWorkGroups.end()),
        workGroupSize(workGroupSize.begin(), workGroupSize.end()) {}

  void runOnFunction() override {
    // Insert the number of workgroups and the workgroup sizes as constant
    // values. This pass is only used for testing.
    FuncOp funcOp = getFunction();
    OpBuilder builder(funcOp.getOperation()->getRegion(0));
    SmallVector<Value, 3> numWorkGroupsVal, workGroupSizeVal;
    for (auto val : numWorkGroups) {
      auto constOp = builder.create<ConstantOp>(
          funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
      numWorkGroupsVal.push_back(constOp);
    }
    for (auto val : workGroupSize) {
      auto constOp = builder.create<ConstantOp>(
          funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
      workGroupSizeVal.push_back(constOp);
    }
    for (Block &block : getFunction()) {
      for (Operation &op : llvm::make_early_inc_range(block)) {
        if (auto forOp = dyn_cast<ForOp>(&op)) {
          if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal,
                                            workGroupSizeVal))) {
            return signalPassFailure();
          }
        }
      }
    }
  }
  SmallVector<int64_t, 3> numWorkGroups;
  SmallVector<int64_t, 3> workGroupSize;
};

} // namespace

std::unique_ptr<OpPassBase<FuncOp>>
mlir::createSimpleLoopsToGPUPass(unsigned numBlockDims,
                                 unsigned numThreadDims) {
  return std::make_unique<ForLoopMapper>(numBlockDims, numThreadDims);
}
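
// Sketch of programmatic usage of the factory above. This is an assumption
// about the surrounding pass manager API, not something exercised in this
// file:
//   PassManager pm(module.getContext());
//   pm.nest<FuncOp>().addPass(createSimpleLoopsToGPUPass(/*numBlockDims=*/1,
//                                                        /*numThreadDims=*/1));
//   (void)pm.run(module);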

std::unique_ptr<OpPassBase<FuncOp>>
mlir::createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups,
                          ArrayRef<int64_t> workGroupSize) {
  return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                          workGroupSize);
}

static PassRegistration<ForLoopMapper>
    registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] {
      return std::make_unique<ForLoopMapper>(clNumBlockDims.getValue(),
                                             clNumThreadDims.getValue());
    });

static PassRegistration<ImperfectlyNestedForLoopMapper> loopOpToGPU(
    LOOPOP_TO_GPU_PASS_NAME, "Convert top-level loop::ForOp to GPU kernels",
    [] {
      SmallVector<int64_t, 3> numWorkGroups, workGroupSize;
      numWorkGroups.assign(clNumWorkGroups.begin(), clNumWorkGroups.end());
      workGroupSize.assign(clWorkGroupSize.begin(), clWorkGroupSize.end());
      return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                              workGroupSize);
    });