//===- OpToFuncCallLowering.h - GPU ops lowering to custom calls *- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_
#define MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_

#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Builders.h"
namespace mlir {
namespace {
/// Detection trait tor the `getFastmath` instance method.
template <typename T>
using has_get_fastmath_t = decltype(std::declval<T>().getFastmath());
} // namespace

/// Rewriting pattern that replaces SourceOp with a CallOp to `f32Func`,
/// `f64Func`, `f32ApproxFunc`, `f16Func`, or `i32Func`, depending on the
/// element type and the fastMathFlag of that op, if present. The function
/// declaration is added in case it was not added before.
///
/// If the input values are of bf16 type (or of f16 type if f16Func is empty),
/// the value is first extended to f32, the function is called, and the result
/// is truncated back to the original type.
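///
/// For instance, an illustrative bf16 sketch with NVVM (SSA value names are
/// schematic, not produced verbatim by the pattern):
///   %exp_bf16 = math.exp %arg_bf16 : bf16
///
/// becomes roughly
///   %0 = llvm.fpext %arg_bf16 : bf16 to f32
///   %1 = llvm.call @__nv_expf(%0) : (f32) -> f32
///   %2 = llvm.fptrunc %1 : f32 to bf16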
///
/// Example with NVVM:
/// %exp_f32 = math.exp %arg_f32 : f32
///
/// will be transformed into
/// llvm.call @__nv_expf(%arg_f32) : (f32) -> f32
///
/// If the fastMathFlag attribute of SourceOp is `afn` or `fast`, this Op lowers
/// to the approximate calculation function.
///
/// Another example with NVVM:
/// %exp_f32 = math.exp %arg_f32 fastmath<afn> : f32
///
/// will be transformed into
/// llvm.call @__nv_fast_expf(%arg_f32) : (f32) -> f32
///
/// Final example with NVVM:
/// %pow_f32 = math.fpowi %arg_f32, %arg_i32 : f32, i32
///
/// will be transformed into
/// llvm.call @__nv_powif(%arg_f32, %arg_i32) : (f32, i32) -> f32
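///
/// A typical instantiation in a conversion pass looks roughly as follows (a
/// hedged sketch: the libdevice names follow the NVVM convention, and
/// `patterns` / `converter` are assumed to be in scope):
///
///   patterns.add<OpToFuncCallLowering<math::ExpOp>>(
///       converter, /*f32Func=*/"__nv_expf", /*f64Func=*/"__nv_exp",
///       /*f32ApproxFunc=*/"__nv_fast_expf", /*f16Func=*/"");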
template <typename SourceOp>
struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
public:
explicit OpToFuncCallLowering(const LLVMTypeConverter &lowering,
StringRef f32Func, StringRef f64Func,
StringRef f32ApproxFunc, StringRef f16Func,
StringRef i32Func = "",
PatternBenefit benefit = 1)
: ConvertOpToLLVMPattern<SourceOp>(lowering, benefit), f32Func(f32Func),
f64Func(f64Func), f32ApproxFunc(f32ApproxFunc), f16Func(f16Func),
i32Func(i32Func) {}
LogicalResult
matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
using LLVM::LLVMFuncOp;
static_assert(
std::is_base_of<OpTrait::OneResult<SourceOp>, SourceOp>::value,
"expected single result op");
bool isResultBool = op->getResultTypes().front().isInteger(1);
if constexpr (!std::is_base_of<OpTrait::SameOperandsAndResultType<SourceOp>,
SourceOp>::value) {
assert(op->getNumOperands() > 0 &&
"expected op to take at least one operand");
assert((op->getResultTypes().front() == op->getOperand(0).getType() ||
isResultBool) &&
"expected op with same operand and result types");
}
if (!op->template getParentOfType<FunctionOpInterface>()) {
return rewriter.notifyMatchFailure(
op, "expected op to be within a function region");
}
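    // Promote bf16 (and f16 when no f16 variant was provided) operands to
    // f32 so that the operand types match the runtime function's signature.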
SmallVector<Value, 1> castedOperands;
for (Value operand : adaptor.getOperands())
castedOperands.push_back(maybeCast(operand, rewriter));
Type castedOperandType = castedOperands.front().getType();
// At ABI level, booleans are treated as i32.
Type resultType =
isResultBool ? rewriter.getIntegerType(32) : castedOperandType;
Type funcType = getFunctionType(resultType, castedOperands);
StringRef funcName = getFunctionName(castedOperandType, op);
if (funcName.empty())
return failure();
LLVMFuncOp funcOp = appendOrGetFuncOp(funcName, funcType, op);
auto callOp =
rewriter.create<LLVM::CallOp>(op->getLoc(), funcOp, castedOperands);
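    // If no promotion was necessary, the call result already has the type
    // the original op produced and can replace it directly.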
if (resultType == adaptor.getOperands().front().getType()) {
rewriter.replaceOp(op, {callOp.getResult()});
return success();
}
    // Boolean results map to i32 at the ABI level, with zero interpreted as
    // false and any non-zero value interpreted as true. Since there is no
    // guarantee of a specific value being used to indicate true, compare for
    // inequality with zero (rather than truncate or shift).
if (isResultBool) {
Value zero = rewriter.create<LLVM::ConstantOp>(
op->getLoc(), rewriter.getIntegerType(32),
rewriter.getI32IntegerAttr(0));
Value truncated = rewriter.create<LLVM::ICmpOp>(
op->getLoc(), LLVM::ICmpPredicate::ne, callOp.getResult(), zero);
rewriter.replaceOp(op, {truncated});
return success();
}
assert(callOp.getResult().getType().isF32() &&
"only f32 types are supposed to be truncated back");
Value truncated = rewriter.create<LLVM::FPTruncOp>(
op->getLoc(), adaptor.getOperands().front().getType(),
callOp.getResult());
rewriter.replaceOp(op, {truncated});
return success();
}
Value maybeCast(Value operand, PatternRewriter &rewriter) const {
Type type = operand.getType();
if (!isa<Float16Type, BFloat16Type>(type))
return operand;
// If there's an f16 function, no need to cast f16 values.
if (!f16Func.empty() && isa<Float16Type>(type))
return operand;
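    // Otherwise extend to f32; the result is truncated back after the call.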
return rewriter.create<LLVM::FPExtOp>(
operand.getLoc(), Float32Type::get(rewriter.getContext()), operand);
}
Type getFunctionType(Type resultType, ValueRange operands) const {
SmallVector<Type> operandTypes(operands.getTypes());
return LLVM::LLVMFunctionType::get(resultType, operandTypes);
}
LLVM::LLVMFuncOp appendOrGetFuncOp(StringRef funcName, Type funcType,
Operation *op) const {
using LLVM::LLVMFuncOp;
auto funcAttr = StringAttr::get(op->getContext(), funcName);
auto funcOp =
SymbolTable::lookupNearestSymbolFrom<LLVMFuncOp>(op, funcAttr);
if (funcOp)
return funcOp;
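    // No declaration exists yet: insert one at module scope, just before the
    // surrounding function, so every lowered call can share it.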
auto parentFunc = op->getParentOfType<FunctionOpInterface>();
assert(parentFunc && "expected there to be a parent function");
OpBuilder b(parentFunc);
return b.create<LLVMFuncOp>(op->getLoc(), funcName, funcType);
}
StringRef getFunctionName(Type type, SourceOp op) const {
bool useApprox = false;
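    // Select the approximate variant only for ops that expose a fastmath
    // attribute, when the `afn` flag is set and an approximate f32 function
    // was provided.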
if constexpr (llvm::is_detected<has_get_fastmath_t, SourceOp>::value) {
arith::FastMathFlags flag = op.getFastmath();
useApprox = ((uint32_t)arith::FastMathFlags::afn & (uint32_t)flag) &&
!f32ApproxFunc.empty();
}
if (isa<Float16Type>(type))
return f16Func;
if (isa<Float32Type>(type)) {
if (useApprox)
return f32ApproxFunc;
return f32Func;
}
if (isa<Float64Type>(type))
return f64Func;
if (type.isInteger(32))
return i32Func;
return "";
}
const std::string f32Func;
const std::string f64Func;
const std::string f32ApproxFunc;
const std::string f16Func;
const std::string i32Func;
};

} // namespace mlir

#endif // MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_