blob: 5885facd07541e476ab67abb146073e8e5b40d78 [file] [log] [blame]
//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This header file defines prototypes that expose pass constructors.
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
#include "Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include <optional>
namespace llvm {
class TargetMachine;
class LLVMContext;
class Module;
} // namespace llvm
namespace mlir {
class TypeConverter;
class ConversionTarget;
namespace func {
class FuncOp;
} // namespace func
#define GEN_PASS_DECL
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
/// Pass that moves ops which are likely an index computation into gpu.launch
/// body.
std::unique_ptr<Pass> createGpuLauchSinkIndexComputationsPass();
/// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
/// a separate kernel function.
std::unique_ptr<OperationPass<ModuleOp>>
createGpuKernelOutliningPass(StringRef dataLayoutStr = StringRef());
/// Rewrites a function region so that GPU ops execute asynchronously.
std::unique_ptr<OperationPass<func::FuncOp>> createGpuAsyncRegionPass();
/// Maps the parallel loops found in the given function to workgroups. The first
/// loop encountered will be mapped to the global workgroup and the second loop
/// encountered to the local workgroup. Within each mapping, the first three
/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
/// mapped to sequential loops.
std::unique_ptr<OperationPass<func::FuncOp>> createGpuMapParallelLoopsPass();
/// Collect a set of patterns to rewrite GlobalIdOp op within the GPU dialect.
void populateGpuGlobalIdPatterns(RewritePatternSet &patterns);
/// Collect a set of patterns to rewrite shuffle ops within the GPU dialect.
void populateGpuShufflePatterns(RewritePatternSet &patterns);
/// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
void populateGpuAllReducePatterns(RewritePatternSet &patterns);
/// Collect a set of patterns to break down subgroup_reduce ops into smaller
/// ones supported by the target of `size <= maxShuffleBitwidth`, where `size`
/// is the subgroup_reduce value bitwidth.
void populateGpuBreakDownSubgrupReducePatterns(RewritePatternSet &patterns,
unsigned maxShuffleBitwidth = 32,
PatternBenefit benefit = 1);
/// Collect a set of patterns to lower `gpu.subgroup_reduce` into `gpu.shuffle`
/// ops over `shuffleBitwidth` scalar types. Assumes that the subgroup has
/// `subgroupSize` lanes. Uses the butterfly shuffle algorithm.
void populateGpuLowerSubgroupReduceToShufflePattenrs(
RewritePatternSet &patterns, unsigned subgroupSize,
unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
/// Collect all patterns to rewrite ops within the GPU dialect.
inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
populateGpuAllReducePatterns(patterns);
populateGpuGlobalIdPatterns(patterns);
populateGpuShufflePatterns(patterns);
}
namespace gpu {
/// Searches for all GPU modules in `op` and transforms them into GPU binary
/// operations. The resulting `gpu.binary` has `handler` as its offloading
/// handler attribute.
LogicalResult transformGpuModulesToBinaries(
Operation *op, OffloadingLLVMTranslationAttrInterface handler = nullptr,
const gpu::TargetOptions &options = {});
/// Base pass class to serialize kernel functions through LLVM into
/// user-specified IR and add the resulting blob as module attribute.
class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
public:
SerializeToBlobPass(TypeID passID);
SerializeToBlobPass(const SerializeToBlobPass &other);
void runOnOperation() final;
protected:
/// Hook allowing the application of optimizations before codegen
/// By default, does nothing
virtual LogicalResult optimizeLlvm(llvm::Module &llvmModule,
llvm::TargetMachine &targetMachine);
/// Translates the 'getOperation()' result to an LLVM module.
virtual std::unique_ptr<llvm::Module>
translateToLLVMIR(llvm::LLVMContext &llvmContext);
private:
/// Creates the LLVM target machine to generate the ISA.
std::unique_ptr<llvm::TargetMachine> createTargetMachine();
/// Translates the module to ISA
std::optional<std::string> translateToISA(llvm::Module &llvmModule,
llvm::TargetMachine &targetMachine);
/// Serializes the target ISA to binary form.
virtual std::unique_ptr<std::vector<char>>
serializeISA(const std::string &isa) = 0;
protected:
Option<std::string> triple{*this, "triple",
::llvm::cl::desc("Target triple")};
Option<std::string> chip{*this, "chip",
::llvm::cl::desc("Target architecture")};
Option<std::string> features{*this, "features",
::llvm::cl::desc("Target features")};
Option<int> optLevel{*this, "opt-level",
llvm::cl::desc("Optimization level for compilation"),
llvm::cl::init(2)};
Option<std::string> gpuBinaryAnnotation{
*this, "gpu-binary-annotation",
llvm::cl::desc("Annotation attribute string for GPU binary"),
llvm::cl::init(getDefaultGpuBinaryAnnotation())};
Option<bool> dumpPtx{*this, "dump-ptx",
::llvm::cl::desc("Dump generated PTX"),
llvm::cl::init(false)};
};
} // namespace gpu
//===----------------------------------------------------------------------===//
// Registration
//===----------------------------------------------------------------------===//
/// Register pass to serialize GPU kernel functions to a CUBIN binary
/// annotation.
LLVM_DEPRECATED("use Target attributes instead", "")
void registerGpuSerializeToCubinPass();
/// Register pass to serialize GPU kernel functions to a HSAco binary
/// annotation.
LLVM_DEPRECATED("use Target attributes instead", "")
void registerGpuSerializeToHsacoPass();
/// Create an instance of the GPU kernel function to CUBIN binary serialization
/// pass with optLevel (default level 2).
LLVM_DEPRECATED("use Target attributes instead", "")
std::unique_ptr<Pass> createGpuSerializeToCubinPass(StringRef triple,
StringRef chip,
StringRef features,
int optLevel = 2,
bool dumpPtx = false);
/// Create an instance of the GPU kernel function to HSAco binary serialization
/// pass.
LLVM_DEPRECATED("use Target attributes instead", "")
std::unique_ptr<Pass> createGpuSerializeToHsacoPass(StringRef triple,
StringRef arch,
StringRef features,
int optLevel);
/// Collect a set of patterns to decompose memrefs ops.
void populateGpuDecomposeMemrefsPatterns(RewritePatternSet &patterns);
/// Pass decomposes memref ops inside `gpu.launch` body.
std::unique_ptr<Pass> createGpuDecomposeMemrefsPass();
/// Erase barriers that do not enforce conflicting memory side effects.
void populateGpuEliminateBarriersPatterns(RewritePatternSet &patterns);
/// Generate the code for registering passes.
#define GEN_PASS_REGISTRATION
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
} // namespace mlir
#endif // MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_