//===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect kernel outlining pass.
//
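// For illustration (schematic, not verbatim IR):
//
//   gpu.launch blocks(...) in (...) threads(...) in (...) {
//     ... body using %arg0 defined above ...
//     gpu.terminator
//   }
//
// becomes a `gpu.launch_func` targeting a `gpu.func` placed inside a nested
// `gpu.module`, with values such as %arg0 forwarded as kernel arguments.
//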
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/Passes.h"

#include "mlir/AsmParser/AsmParser.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include <limits>

namespace mlir {
#define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONS
#define GEN_PASS_DEF_GPUKERNELOUTLINING
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;

template <typename OpTy>
static void createForAllDimensions(OpBuilder &builder, Location loc,
                                    SmallVectorImpl<Value> &values) {
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z})
    values.push_back(builder.create<OpTy>(loc, builder.getIndexType(), dim));
}
/// Adds operations generating block/thread ids and grid/block dimensions at
/// the beginning of the `launchFuncOpBody` region, and maps each argument of
/// the entry block of `launchOpBody` to the result of the corresponding added
/// operation.
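///
/// For illustration (schematic), the prologue built here looks roughly like
///   %0  = gpu.block_id x
///   %1  = gpu.block_id y
///   %2  = gpu.block_id z
///   %3  = gpu.thread_id x
///   ...
///   %11 = gpu.block_dim z
/// with cluster id/dim operations appended when `hasCluster` is set.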
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
                                     Region &launchOpBody, IRMapping &map,
                                     bool hasCluster = false) {
  OpBuilder builder(loc->getContext());
  Block &firstBlock = launchOpBody.front();
  builder.setInsertionPointToStart(&launchFuncOpBody.front());
  SmallVector<Value> indexOps;
  // The order is important here, as it must match the order of the arguments
  createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
  if (hasCluster) {
    createForAllDimensions<gpu::ClusterIdOp>(builder, loc, indexOps);
    createForAllDimensions<gpu::ClusterDimOp>(builder, loc, indexOps);
  }
  // Map the leading block arguments of the launch body (block/thread ids and
  // grid/block dims, plus cluster ids/dims when present) to the index
  // operations created above; the order of `indexOps` matches the argument
  // order.
  for (const auto &indexOp : enumerate(indexOps))
    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}

/// Identifies operations that are beneficial to sink into kernels. These
/// operations must not have side effects, as otherwise sinking (and hence
/// duplicating) them is not legal.
static bool isLikelyAnIndexComputation(Operation *op) {
  return matchPattern(op, m_Constant()) ||
         isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(op);
}

/// For a given operation `op`, computes whether it is beneficial to sink the
/// operation into the kernel. An operation can be sunk if doing so does not
/// introduce new kernel arguments. Whether a value is already available in the
/// kernel (and hence does not introduce new arguments) is checked by
/// querying `existingDependencies` and `availableValues`.
/// If an operand is not yet available, we recursively check whether it can be
/// made available by sinking its defining op.
/// Operations that are identified for sinking are added to `beneficiaryOps` in
/// the order they should appear in the kernel. Furthermore, `availableValues`
/// is updated with results that will be available after sinking the identified
/// ops.
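///
/// For illustration (schematic): given
///   %c0  = arith.constant 0 : index
///   %dim = memref.dim %mem, %c0 : memref<?xf32>
///   gpu.launch ... {
///     ... uses both %mem and %dim ...
///   }
/// sinking the `memref.dim` is beneficial: %mem is already a dependency of the
/// launch and %c0 can itself be sunk, so cloning both ops into the body removes
/// %dim (but not %mem) from the values the kernel would otherwise need as
/// arguments. The recursion visits the defining op of %c0 first, so
/// `beneficiaryOps` ends up ordered (constant, dim).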
static bool extractBeneficiaryOps(
    Operation *op, const SetVector<Value> &existingDependencies,
    SetVector<Operation *> &beneficiaryOps,
    llvm::SmallPtrSetImpl<Value> &availableValues,
    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
  if (beneficiaryOps.count(op))
    return true;

  if (!isSinkingBeneficiary(op))
    return false;

  for (Value operand : op->getOperands()) {
    // It is already visible in the kernel, keep going.
    if (availableValues.count(operand))
      continue;
    // Else check whether it can be made available via sinking or already is a
    // dependency.
    Operation *definingOp = operand.getDefiningOp();
    if ((!definingOp || !extractBeneficiaryOps(definingOp, existingDependencies,
                                               beneficiaryOps, availableValues,
                                               isSinkingBeneficiary)) &&
        !existingDependencies.count(operand))
      return false;
  }
  // We will sink the operation, mark its results as now available.
  beneficiaryOps.insert(op);
  for (Value result : op->getResults())
    availableValues.insert(result);
  return true;
}

LogicalResult mlir::sinkOperationsIntoLaunchOp(
    gpu::LaunchOp launchOp,
    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
  assert(isSinkingBeneficiary);
  Region &launchOpBody = launchOp.getBody();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  SetVector<Value> sinkCandidates;
  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);

  SetVector<Operation *> toBeSunk;
  llvm::SmallPtrSet<Value, 4> availableValues;
  for (Value operand : sinkCandidates) {
    Operation *operandOp = operand.getDefiningOp();
    if (!operandOp)
      continue;
    extractBeneficiaryOps(operandOp, sinkCandidates, toBeSunk, availableValues,
                          isSinkingBeneficiary);
  }

  // Insert operations so that the defs get cloned before uses.
  IRMapping map;
  OpBuilder builder(launchOpBody);
  for (Operation *op : toBeSunk) {
    Operation *clonedOp = builder.clone(*op, map);
    // Only replace uses within the launch op.
    for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
      replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
                                 launchOp.getBody());
  }
  return success();
}

/// Return the provided KernelDim3 as an array of i32 constants if possible.
static DenseI32ArrayAttr maybeConstantDimsAttr(gpu::KernelDim3 dims) {
  SmallVector<int32_t, 3> constants;
  MLIRContext *ctx = dims.x.getContext();
  for (Value v : {dims.x, dims.y, dims.z}) {
    APInt constValue;
    if (!matchPattern(v, m_ConstantInt(&constValue)))
      return nullptr;
    // In the event someone called for a too-large block or grid dimension,
    // don't set bounds as it is likely to cause more confusing behavior.
    if (constValue.ugt(std::numeric_limits<uint32_t>::max()))
      return nullptr;
    constants.push_back(
        constValue.getLimitedValue(std::numeric_limits<uint32_t>::max()));
  }
  return DenseI32ArrayAttr::get(ctx, constants);
}

/// Outline the `gpu.launch` operation body into a kernel function. Replace
/// `gpu.terminator` operations by `gpu.return` in the generated function.
/// Set block and grid size bounds if known.
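///
/// For illustration (schematic, names made up), a launch whose block size is
/// the constant (32, 1, 1) and whose body uses one value %arg0 from above is
/// outlined into roughly
///   gpu.func @foo_kernel(%arg0: memref<?xf32>) kernel
///       attributes {known_block_size = array<i32: 32, 1, 1>} {
///     %0 = gpu.block_id x
///     ...
///     gpu.return
///   }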
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                            StringRef kernelFnName,
                                            SetVector<Value> &operands) {
  Location loc = launchOp.getLoc();
  // Create a builder with no insertion point; insertion will happen separately
  // due to symbol table manipulation.
  OpBuilder builder(launchOp.getContext());
  Region &launchOpBody = launchOp.getBody();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  getUsedValuesDefinedAbove(launchOpBody, operands);

  // Create the gpu.func operation.
  SmallVector<Type, 4> kernelOperandTypes;
  kernelOperandTypes.reserve(operands.size());
  for (Value operand : operands) {
    kernelOperandTypes.push_back(operand.getType());
  }
  FunctionType type =
      FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(
      loc, kernelFnName, type,
      TypeRange(ValueRange(launchOp.getWorkgroupAttributions())),
      TypeRange(ValueRange(launchOp.getPrivateAttributions())));
  outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());

  // If we can infer bounds on the grid and/or block sizes from the arguments
  // to the launch op, propagate them to the generated kernel. This is safe
  // because multiple launches with the same body are not deduplicated.
  if (auto blockBounds =
          maybeConstantDimsAttr(launchOp.getBlockSizeOperandValues()))
    outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName(),
                          blockBounds);
  if (auto gridBounds =
          maybeConstantDimsAttr(launchOp.getGridSizeOperandValues()))
    outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownGridSizeAttrName(),
                          gridBounds);

  IRMapping map;

  // Map the arguments corresponding to the launch parameters like blockIdx,
  // threadIdx, etc. If a cluster size is present, we also generate clusterIdx
  // and clusterDim.
  Region &outlinedFuncBody = outlinedFunc.getBody();
  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map,
                           launchOp.hasClusterSize());

  // Map memory attributions from the LaunchOp to the GPUFuncOp attributions.
  for (const auto &[launchArg, funcArg] :
       llvm::zip(launchOp.getWorkgroupAttributions(),
                 outlinedFunc.getWorkgroupAttributions()))
    map.map(launchArg, funcArg);
  for (const auto &[launchArg, funcArg] :
       llvm::zip(launchOp.getPrivateAttributions(),
                 outlinedFunc.getPrivateAttributions()))
    map.map(launchArg, funcArg);

  // Map arguments from gpu.launch region to the arguments of the gpu.func
  // operation.
  Block &entryBlock = outlinedFuncBody.front();
  for (const auto &operand : enumerate(operands))
    map.map(operand.value(), entryBlock.getArgument(operand.index()));

  // Clone the region of the gpu.launch operation into the gpu.func operation.
  // TODO: If cloneInto can be modified such that if a mapping for
  // a block exists, that block will be used to clone operations into (at the
  // end of the block), instead of creating a new block, this would be much
  // cleaner.
  launchOpBody.cloneInto(&outlinedFuncBody, map);

  // Branch from entry of the gpu.func operation to the block that is cloned
  // from the entry block of the gpu.launch operation.
  Block &launchOpEntry = launchOpBody.front();
  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
  builder.setInsertionPointToEnd(&entryBlock);
  builder.create<cf::BranchOp>(loc, clonedLaunchOpEntry);

  outlinedFunc.walk([](gpu::TerminatorOp op) {
    OpBuilder replacer(op);
    replacer.create<gpu::ReturnOp>(op.getLoc());
    op.erase();
  });
  return outlinedFunc;
}

gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
                                       StringRef kernelFnName,
                                       llvm::SmallVectorImpl<Value> &operands) {
  DenseSet<Value> inputOperandSet;
  inputOperandSet.insert(operands.begin(), operands.end());
  SetVector<Value> operandSet(operands.begin(), operands.end());
  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
  for (auto operand : operandSet) {
    if (!inputOperandSet.count(operand))
      operands.push_back(operand);
  }
  return funcOp;
}

/// Replace `gpu.launch` operations with a `gpu.launch_func` operation
/// launching `kernelFunc`. The kernel func contains the body of the
/// `gpu.launch` with constant region arguments inlined.
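///
/// For illustration (schematic, names made up), the original
///   gpu.launch blocks(%bx, %by, %bz) in (%gx = %c8, %gy = %c1, %gz = %c1)
///              threads(%tx, %ty, %tz) in (%sx = %c32, %sy = %c1, %sz = %c1) {
///     ...
///   }
/// is replaced by roughly
///   gpu.launch_func @foo_kernel::@foo_kernel
///       blocks in (%c8, %c1, %c1) threads in (%c32, %c1, %c1)
///       args(%arg0 : memref<?xf32>)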
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                  gpu::GPUFuncOp kernelFunc,
                                  ValueRange operands) {
  OpBuilder builder(launchOp);
  // The launch op has an optional dynamic shared memory size. If it doesn't
  // exist, we use zero.
  Value asyncToken = launchOp.getAsyncToken();
  std::optional<gpu::KernelDim3> clusterSize =
      launchOp.getClusterSizeOperandValues();
  auto launchFunc = builder.create<gpu::LaunchFuncOp>(
      launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
      launchOp.getBlockSizeOperandValues(),
      launchOp.getDynamicSharedMemorySize(), operands,
      asyncToken ? asyncToken.getType() : nullptr,
      launchOp.getAsyncDependencies(), clusterSize);
  launchOp.replaceAllUsesWith(launchFunc);
  launchOp.erase();
}

namespace {
/// Pass that moves ops that are likely index computations into the gpu.launch
/// body.
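///
/// For illustration (schematic), a constant defined outside the launch,
///   %c1 = arith.constant 1 : index
///   gpu.launch ... {
///     ... uses %c1 ...
///   }
/// is cloned into the body, and the uses inside the region are redirected to
/// the clone:
///   gpu.launch ... {
///     %c1 = arith.constant 1 : index
///     ... uses the cloned %c1 ...
///   }
/// so that subsequent outlining produces one fewer kernel argument.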
class GpuLaunchSinkIndexComputationsPass
    : public impl::GpuLaunchSinkIndexComputationsBase<
          GpuLaunchSinkIndexComputationsPass> {
public:
  void runOnOperation() override {
    Operation *op = getOperation();
    if (op->walk([](gpu::LaunchOp launch) {
            // Pull in instructions that can be sunk
            if (failed(sinkOperationsIntoLaunchOp(launch,
                                                  isLikelyAnIndexComputation)))
              return WalkResult::interrupt();

            return WalkResult::advance();
          }).wasInterrupted())
      signalPassFailure();
  }
};

/// Pass that moves the kernel of each LaunchOp into its separate nested module.
///
/// This pass moves the kernel code of each LaunchOp into a gpu.func created
/// inside a nested gpu.module. It also rewrites the original gpu.launch into a
/// gpu.launch_func that references the outlined kernel by symbol.
///
/// The gpu.modules are intended to be compiled to a device binary (e.g. a cubin
/// blob) independently in a separate pass; the gpu.launch_func ops then refer
/// to the compiled kernels by symbol.
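///
/// For illustration (schematic, names made up), after this pass the module
/// looks roughly like
///   module attributes {gpu.container_module} {
///     func.func @foo(...) {
///       ...
///       gpu.launch_func @foo_kernel::@foo_kernel blocks in (...)
///           threads in (...) args(...)
///     }
///     gpu.module @foo_kernel {
///       gpu.func @foo_kernel(...) kernel {
///         ...
///         gpu.return
///       }
///     }
///   }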
class GpuKernelOutliningPass
    : public impl::GpuKernelOutliningBase<GpuKernelOutliningPass> {
public:
  GpuKernelOutliningPass(StringRef dlStr) {
    if (!dlStr.empty() && !dataLayoutStr.hasValue())
      dataLayoutStr = dlStr.str();
  }

  GpuKernelOutliningPass(const GpuKernelOutliningPass &other)
      : GpuKernelOutliningBase(other), dataLayoutSpec(other.dataLayoutSpec) {
    dataLayoutStr = other.dataLayoutStr.getValue();
  }

  LogicalResult initialize(MLIRContext *context) override {
    // Initialize the data layout specification from the data layout string.
    if (!dataLayoutStr.empty()) {
      Attribute resultAttr = mlir::parseAttribute(dataLayoutStr, context);
      if (!resultAttr)
        return failure();

      dataLayoutSpec = dyn_cast<DataLayoutSpecInterface>(resultAttr);
      if (!dataLayoutSpec)
        return failure();
    }

    return success();
  }

  void runOnOperation() override {
    SymbolTable symbolTable(getOperation());
    bool modified = false;
    for (auto func : getOperation().getOps<SymbolOpInterface>()) {
      // Insert just after the function.
      Block::iterator insertPt(func->getNextNode());
      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
        SetVector<Value> operands;
        std::string kernelFnName =
            Twine(op->getParentOfType<SymbolOpInterface>().getName(), "_kernel")
                .str();

        gpu::GPUFuncOp outlinedFunc =
            outlineKernelFuncImpl(op, kernelFnName, operands);

        // Create nested module and insert outlinedFunc. The module will
        // originally get the same name as the function, but may be renamed on
        // insertion into the parent module.
        auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
        symbolTable.insert(kernelModule, insertPt);

        // Potentially changes signature, pulling in constants.
        convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
        modified = true;
        return WalkResult::advance();
      });
      if (funcWalkResult.wasInterrupted())
        return signalPassFailure();
    }

    // If any new module was inserted in this module, annotate this module as
    // a container module.
    if (modified)
      getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                              UnitAttr::get(&getContext()));
  }

private:
  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
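  ///
  /// For illustration (schematic): if the outlined kernel calls a helper,
  ///   gpu.func @foo_kernel(...) kernel {
  ///     ... func.call @device_helper(...) ...
  ///   }
  /// then @device_helper (and, transitively, any symbols it references) is
  /// cloned from the parent module into the new gpu.module next to the kernel.
  /// The name @device_helper is made up here.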
  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into
    // a SymbolTable by the caller. SymbolTable needs to be refactored to
    // prevent manual building of Ops with symbols in code using SymbolTables
    // and then this needs to use the OpBuilder.
    auto *context = getOperation().getContext();
    OpBuilder builder(context);
    auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                         kernelFunc.getName());

    // If a valid data layout spec was provided, attach it to the kernel module.
    // Otherwise, the default data layout will be used.
    if (dataLayoutSpec)
      kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);

    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

    SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
    while (!symbolDefWorklist.empty()) {
      if (std::optional<SymbolTable::UseRange> symbolUses =
              SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
        for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
          StringRef symbolName =
              cast<FlatSymbolRefAttr>(symbolUse.getSymbolRef()).getValue();
          if (symbolTable.lookup(symbolName))
            continue;

          Operation *symbolDefClone =
              parentSymbolTable.lookup(symbolName)->clone();
          symbolDefWorklist.push_back(symbolDefClone);
          symbolTable.insert(symbolDefClone);
        }
      }
    }

    return kernelModule;
  }

  Option<std::string> dataLayoutStr{
      *this, "data-layout-str",
      llvm::cl::desc("String containing the data layout specification to be "
                     "attached to the GPU kernel module")};

  DataLayoutSpecInterface dataLayoutSpec;
};

} // namespace

std::unique_ptr<Pass> mlir::createGpuLauchSinkIndexComputationsPass() {
  return std::make_unique<GpuLaunchSinkIndexComputationsPass>();
}

std::unique_ptr<OperationPass<ModuleOp>>
mlir::createGpuKernelOutliningPass(StringRef dataLayoutStr) {
  return std::make_unique<GpuKernelOutliningPass>(dataLayoutStr);
}