//===- BufferizableOpInterfaceImpl.cpp - Impl. of BufferizableOpInterface -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Bufferization/IR/DstBufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/Transforms/SubsetInsertionOpInterfaceImpl.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Operation.h"
using namespace mlir;
using namespace mlir::bufferization;
using namespace mlir::tensor;
namespace mlir {
namespace tensor {
namespace {
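/// Bufferization of tensor.cast. Replace with memref.cast, or fold away if the
/// source buffer already has the requested type.
///
/// Illustrative sketch (placeholder SSA names; the exact result layout depends
/// on how the source tensor bufferizes and on the bufferization options):
/// ```
/// %0 = tensor.cast %t : tensor<4xf32> to tensor<?xf32>
/// ```
/// may bufferize to:
/// ```
/// %0 = memref.cast %t_buf : memref<4xf32> to memref<?xf32>
/// ```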
struct CastOpInterface
: public BufferizableOpInterface::ExternalModel<CastOpInterface,
tensor::CastOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return {{op->getResult(0), BufferRelation::Equivalent}};
}
FailureOr<BaseMemRefType>
getBufferType(Operation *op, Value value, const BufferizationOptions &options,
SmallVector<Value> &invocationStack) const {
auto castOp = cast<tensor::CastOp>(op);
auto maybeSrcBufferType = bufferization::getBufferType(
castOp.getSource(), options, invocationStack);
if (failed(maybeSrcBufferType))
return failure();
Attribute memorySpace = maybeSrcBufferType->getMemorySpace();
// Note: `getMemRefTypeWithFullyDynamicLayout` returns an unranked memref
// type in case the input is an unranked tensor type.
// Case 1: Casting an unranked tensor
if (isa<UnrankedTensorType>(castOp.getSource().getType())) {
// When casting to a ranked tensor, we cannot infer any static offset or
// strides from the source. Assume fully dynamic.
return getMemRefTypeWithFullyDynamicLayout(castOp.getType(), memorySpace);
}
// Case 2: Casting to an unranked tensor type
if (isa<UnrankedTensorType>(castOp.getType())) {
return getMemRefTypeWithFullyDynamicLayout(castOp.getType(), memorySpace);
}
// Case 3: Ranked tensor -> ranked tensor. The offsets and strides do not
// change.
auto rankedResultType = cast<RankedTensorType>(castOp.getType());
return MemRefType::get(
rankedResultType.getShape(), rankedResultType.getElementType(),
llvm::cast<MemRefType>(*maybeSrcBufferType).getLayout(), memorySpace);
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto castOp = cast<tensor::CastOp>(op);
// The result buffer still has the old (pre-cast) type.
FailureOr<Value> resultBuffer =
getBuffer(rewriter, castOp.getSource(), options);
if (failed(resultBuffer))
return failure();
// Compute the new type.
auto resultMemRefType =
bufferization::getBufferType(castOp.getResult(), options);
if (failed(resultMemRefType))
return failure();
if (resultBuffer->getType() == *resultMemRefType) {
// This cast is a no-op.
replaceOpWithBufferizedValues(rewriter, op, *resultBuffer);
return success();
}
// Replace the op with a memref.cast.
assert(memref::CastOp::areCastCompatible(resultBuffer->getType(),
*resultMemRefType) &&
"CallOp::bufferize: cast incompatible");
replaceOpWithNewBufferizedOp<memref::CastOp>(
rewriter, op, *resultMemRefType, *resultBuffer);
return success();
}
};
/// Bufferization of tensor.collapse_shape. Replace with memref.collapse_shape.
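///
/// Illustrative sketch (placeholder SSA names; layouts depend on how the
/// source bufferizes):
/// ```
/// %0 = tensor.collapse_shape %t [[0, 1]] : tensor<2x4xf32> into tensor<8xf32>
/// ```
/// typically bufferizes to:
/// ```
/// %0 = memref.collapse_shape %t_buf [[0, 1]]
///     : memref<2x4xf32> into memref<8xf32>
/// ```
/// If the source layout is not collapsible, the source is first copied into a
/// new identity-layout allocation (see the bufferize implementation below).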
struct CollapseShapeOpInterface
: public BufferizableOpInterface::ExternalModel<CollapseShapeOpInterface,
tensor::CollapseShapeOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
// tensor.collapse_shape may reallocate, at which point the source buffer is
// copied. I.e., there will be a memory read side effect on the bufferized
// source. This function conservatively returns "true" because whether a
// copy will be created or not is not known at this point.
return true;
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
// TODO: CollapseShapeOp may allocate at runtime.
return {{op->getOpResult(0), BufferRelation::Equivalent}};
}
FailureOr<BaseMemRefType>
getBufferType(Operation *op, Value value, const BufferizationOptions &options,
SmallVector<Value> &invocationStack) const {
auto collapseShapeOp = cast<tensor::CollapseShapeOp>(op);
auto maybeSrcBufferType = bufferization::getBufferType(
collapseShapeOp.getSrc(), options, invocationStack);
if (failed(maybeSrcBufferType))
return failure();
auto srcBufferType = llvm::cast<MemRefType>(*maybeSrcBufferType);
bool canBeCollapsed = memref::CollapseShapeOp::isGuaranteedCollapsible(
srcBufferType, collapseShapeOp.getReassociationIndices());
if (!canBeCollapsed) {
// If dims cannot be collapsed, this op bufferizes to a new allocation.
RankedTensorType tensorResultType = collapseShapeOp.getResultType();
return bufferization::getMemRefTypeWithStaticIdentityLayout(
tensorResultType, srcBufferType.getMemorySpace());
}
return memref::CollapseShapeOp::computeCollapsedType(
srcBufferType, collapseShapeOp.getReassociationIndices());
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto collapseShapeOp = cast<tensor::CollapseShapeOp>(op);
RankedTensorType tensorResultType = collapseShapeOp.getResultType();
FailureOr<Value> maybeBuffer =
getBuffer(rewriter, collapseShapeOp.getSrc(), options);
if (failed(maybeBuffer))
return failure();
Value buffer = *maybeBuffer;
auto bufferType = cast<MemRefType>(buffer.getType());
if (tensorResultType.getRank() == 0) {
// 0-d collapses must go through a different op builder.
MemRefType resultType;
if (bufferType.getLayout().isIdentity()) {
// Standard layout: result type has no offset.
MemRefLayoutAttrInterface layout;
resultType = MemRefType::get({}, tensorResultType.getElementType(),
layout, bufferType.getMemorySpace());
} else {
// Source memref has a layout map: result type has the same offset as
// the source type.
SmallVector<int64_t> strides;
int64_t offset;
if (failed(getStridesAndOffset(bufferType, strides, offset)))
return failure();
resultType = MemRefType::get(
{}, tensorResultType.getElementType(),
StridedLayoutAttr::get(op->getContext(), offset, {}),
bufferType.getMemorySpace());
}
replaceOpWithNewBufferizedOp<memref::CollapseShapeOp>(
rewriter, op, resultType, buffer, collapseShapeOp.getReassociation());
return success();
}
// If the dims are not collapsible (due to an incompatible source layout
// map), force an out-of-place bufferization, i.e., a buffer copy. This
// newly allocated buffer will have no layout map and thus be collapsible.
bool canBeCollapsed = memref::CollapseShapeOp::isGuaranteedCollapsible(
bufferType, collapseShapeOp.getReassociationIndices());
if (!canBeCollapsed) {
// TODO: Create alloc_tensor ops during TensorCopyInsertion.
AnalysisState analysisState(options);
FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
rewriter, op->getLoc(), collapseShapeOp.getSrc(), options);
if (failed(tensorAlloc))
return failure();
auto memrefType =
MemRefType::get(collapseShapeOp.getSrcType().getShape(),
collapseShapeOp.getSrcType().getElementType(),
AffineMap(), bufferType.getMemorySpace());
buffer = rewriter.create<bufferization::ToMemrefOp>(
op->getLoc(), memrefType, *tensorAlloc);
}
// Result type is inferred by the builder.
replaceOpWithNewBufferizedOp<memref::CollapseShapeOp>(
rewriter, op, buffer, collapseShapeOp.getReassociationIndices());
return success();
}
};
/// Bufferization of tensor.dim. Replace with memref.dim.
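///
/// Illustrative sketch (placeholder SSA names):
/// ```
/// %d = tensor.dim %t, %c0 : tensor<?xf32>
/// ```
/// bufferizes to:
/// ```
/// %d = memref.dim %t_buf, %c0 : memref<?xf32>
/// ```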
struct DimOpInterface
: public BufferizableOpInterface::ExternalModel<DimOpInterface,
tensor::DimOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
// The op reads the tensor's metadata but not its contents.
return false;
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return {};
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto dimOp = cast<tensor::DimOp>(op);
FailureOr<Value> v = getBuffer(rewriter, dimOp.getSource(), options);
if (failed(v))
return failure();
replaceOpWithNewBufferizedOp<memref::DimOp>(rewriter, op, *v,
dimOp.getIndex());
return success();
}
};
/// Bufferization of "tensor.empty". Replace with "bufferization.alloc_tensor".
struct EmptyOpInterface
: public BufferizableOpInterface::ExternalModel<EmptyOpInterface,
tensor::EmptyOp> {
bool bufferizesToAllocation(Operation *op, Value value) const { return true; }
bool resultBufferizesToMemoryWrite(Operation *op, OpResult opResult,
const AnalysisState &state) const {
// The returned tensor does not have specified contents.
return false;
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto emptyOp = cast<tensor::EmptyOp>(op);
// Optimization: Fold away the op if it has no uses.
if (op->getUses().empty()) {
rewriter.eraseOp(op);
return success();
}
// Allocate a tensor. This emits a "bufferization.alloc_tensor" op.
FailureOr<Value> allocTensor = allocateTensorForShapedValue(
rewriter, op->getLoc(), emptyOp.getResult(), options, /*copy=*/false);
if (failed(allocTensor))
return failure();
rewriter.replaceOp(op, *allocTensor);
return success();
}
};
/// Bufferization of tensor.expand_shape. Replace with memref.expand_shape.
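///
/// For example (sketch), expanding a tensor<8xf32> into a tensor<2x4xf32> with
/// reassociation [[0, 1]] becomes a memref.expand_shape on the source buffer,
/// i.e., a view of the same memory without a copy. The exact result layout is
/// computed from the source buffer type.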
struct ExpandShapeOpInterface
: public BufferizableOpInterface::ExternalModel<ExpandShapeOpInterface,
tensor::ExpandShapeOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
// In contrast to tensor.collapse_shape, this op can always be bufferized
// without a copy.
return false;
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return {{op->getOpResult(0), BufferRelation::Equivalent}};
}
FailureOr<BaseMemRefType>
getBufferType(Operation *op, Value value, const BufferizationOptions &options,
SmallVector<Value> &invocationStack) const {
auto expandShapeOp = cast<tensor::ExpandShapeOp>(op);
auto maybeSrcBufferType = bufferization::getBufferType(
expandShapeOp.getSrc(), options, invocationStack);
if (failed(maybeSrcBufferType))
return failure();
auto srcBufferType = llvm::cast<MemRefType>(*maybeSrcBufferType);
auto maybeResultType = memref::ExpandShapeOp::computeExpandedType(
srcBufferType, expandShapeOp.getResultType().getShape(),
expandShapeOp.getReassociationIndices());
if (failed(maybeResultType))
return failure();
return *maybeResultType;
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto expandShapeOp = cast<tensor::ExpandShapeOp>(op);
auto tensorResultType = expandShapeOp.getResultType();
FailureOr<Value> buffer =
getBuffer(rewriter, expandShapeOp.getSrc(), options);
if (failed(buffer))
return failure();
// Memref result type is inferred by the builder based on reassociation
// indices and result shape.
replaceOpWithNewBufferizedOp<memref::ExpandShapeOp>(
rewriter, op, tensorResultType.getShape(), *buffer,
expandShapeOp.getReassociationIndices());
return success();
}
};
/// Bufferization of tensor.extract_slice. Replace with memref.subview.
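///
/// Illustrative sketch (placeholder SSA names; the subview layout is inferred
/// from the source buffer type):
/// ```
/// %0 = tensor.extract_slice %t[%o] [4] [1] : tensor<?xf32> to tensor<4xf32>
/// ```
/// bufferizes to:
/// ```
/// %0 = memref.subview %t_buf[%o] [4] [1]
///     : memref<?xf32> to memref<4xf32, strided<[1], offset: ?>>
/// ```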
struct ExtractSliceOpInterface
: public BufferizableOpInterface::ExternalModel<ExtractSliceOpInterface,
tensor::ExtractSliceOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return {{op->getOpResult(0), BufferRelation::Unknown}};
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto extractSliceOp = cast<tensor::ExtractSliceOp>(op);
SmallVector<OpFoldResult> mixedOffsets = extractSliceOp.getMixedOffsets();
SmallVector<OpFoldResult> mixedSizes = extractSliceOp.getMixedSizes();
SmallVector<OpFoldResult> mixedStrides = extractSliceOp.getMixedStrides();
Location loc = extractSliceOp.getLoc();
// Get source buffer.
FailureOr<Value> srcMemref =
getBuffer(rewriter, extractSliceOp.getSource(), options);
if (failed(srcMemref))
return failure();
// Take a subview of the source buffer.
auto resultMemrefType =
bufferization::getBufferType(extractSliceOp.getResult(), options);
if (failed(resultMemrefType))
return failure();
Value subView = rewriter.create<memref::SubViewOp>(
loc, llvm::cast<MemRefType>(*resultMemrefType), *srcMemref, mixedOffsets,
mixedSizes, mixedStrides);
replaceOpWithBufferizedValues(rewriter, op, subView);
return success();
}
FailureOr<BaseMemRefType>
getBufferType(Operation *op, Value value, const BufferizationOptions &options,
SmallVector<Value> &invocationStack) const {
auto extractSliceOp = cast<tensor::ExtractSliceOp>(op);
assert(value == extractSliceOp.getResult() && "invalid value");
auto srcMemrefType = bufferization::getBufferType(
extractSliceOp.getSource(), options, invocationStack);
if (failed(srcMemrefType))
return failure();
SmallVector<OpFoldResult> mixedOffsets = extractSliceOp.getMixedOffsets();
SmallVector<OpFoldResult> mixedSizes = extractSliceOp.getMixedSizes();
SmallVector<OpFoldResult> mixedStrides = extractSliceOp.getMixedStrides();
return cast<BaseMemRefType>(memref::SubViewOp::inferRankReducedResultType(
extractSliceOp.getType().getShape(), llvm::cast<MemRefType>(*srcMemrefType),
mixedOffsets, mixedSizes, mixedStrides));
}
};
/// Bufferization of tensor.extract. Replace with memref.load.
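///
/// Illustrative sketch (placeholder SSA names):
/// ```
/// %0 = tensor.extract %t[%i] : tensor<?xf32>
/// ```
/// bufferizes to:
/// ```
/// %0 = memref.load %t_buf[%i] : memref<?xf32>
/// ```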
struct ExtractOpInterface
: public BufferizableOpInterface::ExternalModel<ExtractOpInterface,
tensor::ExtractOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return true;
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return {};
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto extractOp = cast<tensor::ExtractOp>(op);
FailureOr<Value> srcMemref =
getBuffer(rewriter, extractOp.getTensor(), options);
if (failed(srcMemref))
return failure();
replaceOpWithNewBufferizedOp<memref::LoadOp>(rewriter, op, *srcMemref,
extractOp.getIndices());
return success();
}
};
/// Implements backtracking to traverse indices of the output buffer while
/// iterating over op.elements().
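///
/// For example (illustrative), for a buffer of shape [2, 3] the elements are
/// stored at indices (0,0), (0,1), (0,2), (1,0), (1,1), (1,2), i.e., in the
/// order in which they appear in op.elements().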
static void createStores(RewriterBase &rewriter, Location loc, int dim,
Value buffer, ArrayRef<int64_t> shape,
ArrayRef<Value> constants,
OperandRange::iterator &elementIt,
SmallVectorImpl<Value> &indices) {
if (dim == static_cast<int>(shape.size()) - 1) {
for (int i = 0; i < shape.back(); ++i) {
indices.back() = constants[i];
rewriter.create<memref::StoreOp>(loc, *elementIt, buffer, indices);
++elementIt;
}
return;
}
for (int i = 0; i < shape[dim]; ++i) {
indices[dim] = constants[i];
createStores(rewriter, loc, dim + 1, buffer, shape, constants, elementIt,
indices);
}
}
/// Bufferization of tensor.from_elements.
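///
/// Illustrative sketch (placeholder SSA names): the op bufferizes to a new
/// allocation followed by one store per element.
/// ```
/// %0 = tensor.from_elements %a, %b : tensor<2xf32>
/// ```
/// becomes roughly:
/// ```
/// memref.store %a, %alloc[%c0] : memref<2xf32>
/// memref.store %b, %alloc[%c1] : memref<2xf32>
/// ```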
struct FromElementsOpInterface
: public BufferizableOpInterface::ExternalModel<FromElementsOpInterface,
tensor::FromElementsOp> {
bool bufferizesToAllocation(Operation *op, Value value) const { return true; }
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto fromElementsOp = cast<tensor::FromElementsOp>(op);
auto tensorType = cast<RankedTensorType>(fromElementsOp.getType());
// TODO: Implement memory space for this op.
if (options.defaultMemorySpaceFn(tensorType) != Attribute())
return op->emitError("memory space not implemented yet");
// Allocate a buffer for the result.
Location loc = op->getLoc();
auto shape = tensorType.getShape();
// TODO: Create alloc_tensor ops during TensorCopyInsertion.
FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
rewriter, loc, fromElementsOp.getResult(), options,
/*copy=*/false);
if (failed(tensorAlloc))
return failure();
auto memrefType =
MemRefType::get(tensorType.getShape(), tensorType.getElementType());
Value buffer = rewriter.create<bufferization::ToMemrefOp>(
op->getLoc(), memrefType, *tensorAlloc);
// Case: tensor<0xelem_type>.
if (fromElementsOp.getElements().empty()) {
replaceOpWithBufferizedValues(rewriter, op, buffer);
return success();
}
// Case: tensor<elem_type>.
if (shape.empty()) {
rewriter.create<memref::StoreOp>(
loc, fromElementsOp.getElements().front(), buffer);
replaceOpWithBufferizedValues(rewriter, op, buffer);
return success();
}
// Create constants for the range of possible indices [0, max{shape_i}).
auto maxDim = *llvm::max_element(shape);
SmallVector<Value, 2> constants;
constants.reserve(maxDim);
for (int i = 0; i < maxDim; ++i)
constants.push_back(rewriter.create<arith::ConstantIndexOp>(loc, i));
// Traverse all `elements` and create `memref.store` ops.
auto elementIt = fromElementsOp.getElements().begin();
SmallVector<Value, 2> indices(tensorType.getRank(), constants[0]);
createStores(rewriter, loc, /*dim=*/0, buffer, shape, constants, elementIt,
indices);
replaceOpWithBufferizedValues(rewriter, op, buffer);
return success();
}
};
/// Lower the body of a tensor.generate-like op (one index-typed bbArg per dim).
/// Such ops are lowered to linalg.map with the given tensor as a destination.
///
/// Example:
/// ```
/// %r = tensor.generate %x, %y {
/// ^bb0(%arg0: index, %arg1: index):
/// %0 = "some_op"(%arg0, %arg1) : (index, index) -> (index)
/// tensor.yield %0 : index
/// } : tensor<?x?xindex>
/// ```
///
/// Is lowered to:
/// ```
/// linalg.map ins() outs(%dest) {
/// %d0 = linalg.index 0 : index
/// %d1 = linalg.index 1 : index
/// %0 = "some_op"(%d0, %d1) : (index, index) -> (index)
/// linalg.yield %0 : index
/// }
/// ```
static Value lowerGenerateLikeOpBody(RewriterBase &rewriter, Location loc,
Value tensorDestination,
ValueRange dynamicSizes,
Region &generateBody) {
assert(generateBody.hasOneBlock() && "expected body with single block");
auto tensorType = cast<RankedTensorType>(tensorDestination.getType());
assert(generateBody.getNumArguments() == tensorType.getRank() &&
"rank mismatch");
// Create linalg::MapOp.
OpBuilder::InsertionGuard g(rewriter);
auto linalgOp =
rewriter.create<linalg::MapOp>(loc, tensorType, /*inputs=*/ValueRange(),
/*init=*/tensorDestination);
Block &linalgBody = linalgOp.getMapper().emplaceBlock();
// Create linalg::IndexOps.
rewriter.setInsertionPointToStart(&linalgBody);
SmallVector<Value> indices;
for (int64_t dim = 0; dim < tensorType.getRank(); ++dim)
indices.push_back(rewriter.create<linalg::IndexOp>(loc, dim));
// Move over body.
rewriter.mergeBlocks(&generateBody.front(), &linalgBody, indices);
auto yieldOp = cast<tensor::YieldOp>(linalgBody.getTerminator());
rewriter.replaceOpWithNewOp<linalg::YieldOp>(yieldOp, yieldOp.getValue());
return linalgOp.getResult()[0];
}
/// Bufferization of tensor.generate.
struct GenerateOpInterface
: public BufferizableOpInterface::ExternalModel<GenerateOpInterface,
tensor::GenerateOp> {
bool bufferizesToAllocation(Operation *op, Value value) const { return true; }
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto generateOp = cast<tensor::GenerateOp>(op);
auto type = generateOp.getResult().getType();
// TODO: Implement memory space for this op.
if (options.defaultMemorySpaceFn(type) != Attribute())
return op->emitError("memory space not implemented yet");
// Allocate memory.
Location loc = op->getLoc();
FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
rewriter, loc, generateOp.getResult(), options,
/*copy=*/false);
if (failed(tensorAlloc))
return failure();
Value result = lowerGenerateLikeOpBody(rewriter, loc, *tensorAlloc,
generateOp.getDynamicExtents(),
generateOp.getBody());
rewriter.replaceOp(generateOp, result);
return success();
}
};
/// Bufferization of tensor.insert. Replace with memref.store.
///
/// Note: DstBufferizableOpInterfaceExternalModel provides many default method
/// implementations for DestinationStyle ops.
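///
/// Illustrative sketch (placeholder SSA names):
/// ```
/// %0 = tensor.insert %f into %t[%i] : tensor<?xf32>
/// ```
/// bufferizes to:
/// ```
/// memref.store %f, %t_buf[%i] : memref<?xf32>
/// ```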
struct InsertOpInterface
: public DstBufferizableOpInterfaceExternalModel<InsertOpInterface,
tensor::InsertOp> {
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto insertOp = cast<tensor::InsertOp>(op);
FailureOr<Value> destMemref =
getBuffer(rewriter, insertOp.getDest(), options);
if (failed(destMemref))
return failure();
rewriter.create<memref::StoreOp>(insertOp.getLoc(), insertOp.getScalar(),
*destMemref, insertOp.getIndices());
replaceOpWithBufferizedValues(rewriter, op, *destMemref);
return success();
}
};
/// Bufferization of tensor.insert_slice. Replace with a memory copy. Under
/// certain circumstances, this op can also be a no-op.
///
/// Note: DstBufferizableOpInterfaceExternalModel provides many default method
/// implementations for DestinationStyle ops.
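///
/// Illustrative sketch (placeholder SSA names; assuming the default memcpy
/// option, which emits memref.copy):
/// ```
/// %0 = tensor.insert_slice %src into %dst[%o] [4] [1]
///     : tensor<4xf32> into tensor<?xf32>
/// ```
/// bufferizes to roughly:
/// ```
/// %sv = memref.subview %dst_buf[%o] [4] [1]
///     : memref<?xf32> to memref<4xf32, strided<[1], offset: ?>>
/// memref.copy %src_buf, %sv
///     : memref<4xf32> to memref<4xf32, strided<[1], offset: ?>>
/// ```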
struct InsertSliceOpInterface
: public DstBufferizableOpInterfaceExternalModel<InsertSliceOpInterface,
tensor::InsertSliceOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
auto insertSliceOp = cast<tensor::InsertSliceOp>(op);
RankedTensorType destType = insertSliceOp.getDestType();
// The source is always read.
if (opOperand == insertSliceOp.getSourceMutable())
return true;
// For the destination, it depends...
assert(opOperand == insertSliceOp.getDestMutable() && "expected dest");
// Dest is not read if it is entirely overwritten. E.g.:
// tensor.insert_slice %a into %t[0][10][1] : ... into tensor<10xf32>
bool allOffsetsZero =
llvm::all_of(insertSliceOp.getMixedOffsets(), [](OpFoldResult ofr) {
return isConstantIntValue(ofr, 0);
});
bool sizesMatchDestSizes = llvm::all_of(
llvm::enumerate(insertSliceOp.getMixedSizes()), [&](const auto &it) {
return getConstantIntValue(it.value()) ==
destType.getDimSize(it.index());
});
bool allStridesOne =
llvm::all_of(insertSliceOp.getMixedStrides(), [](OpFoldResult ofr) {
return isConstantIntValue(ofr, 1);
});
return !(allOffsetsZero && sizesMatchDestSizes && allStridesOne);
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
// insert_slice ops arise from tiling and bufferizing them out-of-place is
// generally a deal breaker. When used with loops, this ends up cloning the
// whole tensor on every single iteration and is a symptom of a
// catastrophically bad scheduling decision.
// TODO: be very loud about it or even consider failing the pass.
auto insertSliceOp = cast<tensor::InsertSliceOp>(op);
SmallVector<OpFoldResult> mixedOffsets = insertSliceOp.getMixedOffsets();
SmallVector<OpFoldResult> mixedSizes = insertSliceOp.getMixedSizes();
SmallVector<OpFoldResult> mixedStrides = insertSliceOp.getMixedStrides();
Location loc = insertSliceOp.getLoc();
// Get destination buffer.
FailureOr<Value> dstMemref =
getBuffer(rewriter, insertSliceOp.getDest(), options);
if (failed(dstMemref))
return failure();
// Take a subview of the destination buffer.
auto dstMemrefType = cast<MemRefType>(dstMemref->getType());
auto subviewMemRefType =
cast<MemRefType>(memref::SubViewOp::inferRankReducedResultType(
insertSliceOp.getSourceType().getShape(), dstMemrefType,
mixedOffsets, mixedSizes, mixedStrides));
Value subView = rewriter.create<memref::SubViewOp>(
loc, subviewMemRefType, *dstMemref, mixedOffsets, mixedSizes,
mixedStrides);
// Copy tensor. If this tensor.insert_slice has a matching
// tensor.extract_slice, the copy operation will eventually fold away.
FailureOr<Value> srcMemref =
getBuffer(rewriter, insertSliceOp.getSource(), options);
if (failed(srcMemref))
return failure();
if (failed(options.createMemCpy(rewriter, loc, *srcMemref, subView)))
return failure();
replaceOpWithBufferizedValues(rewriter, op, *dstMemref);
return success();
}
};
/// Bufferization of tensor.pad. Replace with bufferization.alloc_tensor +
/// linalg.map + insert_slice.
/// For best performance, vectorize before bufferization (this is especially
/// beneficial when padding with a constant).
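///
/// Illustrative sketch (placeholder SSA names):
/// ```
/// %0 = tensor.pad %t low[1] high[1] {
/// ^bb0(%i: index):
///   tensor.yield %cst : f32
/// } : tensor<8xf32> to tensor<10xf32>
/// ```
/// bufferizes to roughly:
/// ```
/// %alloc = bufferization.alloc_tensor() : tensor<10xf32>
/// %fill = linalg.map ins() outs(%alloc) { ... pad body ... }
/// %0 = tensor.insert_slice %t into %fill[1] [8] [1]
///     : tensor<8xf32> into tensor<10xf32>
/// ```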
struct PadOpInterface
: public BufferizableOpInterface::ExternalModel<PadOpInterface,
tensor::PadOp> {
bool bufferizesToAllocation(Operation *op, Value value) const { return true; }
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return true;
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return {};
}
FailureOr<BaseMemRefType>
getBufferType(Operation *op, Value value, const BufferizationOptions &options,
SmallVector<Value> &invocationStack) const {
// Infer memory space from the source tensor.
auto padOp = cast<tensor::PadOp>(op);
auto maybeSrcBufferType = bufferization::getBufferType(
padOp.getSource(), options, invocationStack);
if (failed(maybeSrcBufferType))
return failure();
MemRefLayoutAttrInterface layout;
return MemRefType::get(padOp.getResultType().getShape(),
padOp.getResultType().getElementType(), layout,
maybeSrcBufferType->getMemorySpace());
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto padOp = cast<tensor::PadOp>(op);
Location loc = padOp.getLoc();
RankedTensorType resultType = padOp.getResultType();
RankedTensorType srcType = padOp.getSourceType();
auto toValue = [&](OpFoldResult ofr) {
if (ofr.is<Value>())
return ofr.get<Value>();
return rewriter
.create<arith::ConstantIndexOp>(loc, *getConstantIntValue(ofr))
.getResult();
};
// Compute dynamic result dimensions.
SmallVector<OpFoldResult> mixedLowPad = padOp.getMixedLowPad();
SmallVector<OpFoldResult> mixedHighPad = padOp.getMixedHighPad();
SmallVector<Value> dynamicSizes;
for (int64_t i = 0; i < resultType.getRank(); ++i) {
if (!resultType.isDynamicDim(i))
continue;
Value srcDim = rewriter.create<tensor::DimOp>(loc, padOp.getSource(), i);
Value lowPad = toValue(mixedLowPad[i]);
Value highPad = toValue(mixedHighPad[i]);
AffineExpr s0, s1, s2;
bindSymbols(op->getContext(), s0, s1, s2);
AffineExpr sumExpr = s0 + s1 + s2;
Value sum = rewriter.create<affine::AffineApplyOp>(
loc, sumExpr, ValueRange{srcDim, lowPad, highPad});
dynamicSizes.push_back(sum);
}
// Allocate a buffer for the padded result.
FailureOr<Value> tensorAlloc =
allocateTensorForShapedValue(rewriter, loc, padOp.getResult(), options,
/*copy=*/false);
if (failed(tensorAlloc))
return failure();
// tensor::PadOp is like tensor::GenerateOp: The only difference is that
// only a part of the generated tensor is needed. For simplicity, we reuse
// the same functionality here.
Value filledBuffer = lowerGenerateLikeOpBody(
rewriter, loc, *tensorAlloc, dynamicSizes, padOp.getBodyRegion());
// Create tensor::InsertSliceOp.
SmallVector<OpFoldResult> sliceSizes =
getMixedSizes(rewriter, loc, padOp.getSource());
SmallVector<OpFoldResult> sliceStrides(srcType.getRank(),
rewriter.getIndexAttr(1));
rewriter.replaceOpWithNewOp<tensor::InsertSliceOp>(
padOp, padOp.getSource(), filledBuffer,
/*offsets=*/padOp.getMixedLowPad(), sliceSizes, sliceStrides);
return success();
}
};
/// Bufferization of tensor.rank. Replace with memref.rank.
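///
/// Illustrative sketch (placeholder SSA names):
/// ```
/// %0 = tensor.rank %t : tensor<*xf32>
/// ```
/// bufferizes to:
/// ```
/// %0 = memref.rank %t_buf : memref<*xf32>
/// ```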
struct RankOpInterface
: public BufferizableOpInterface::ExternalModel<RankOpInterface,
tensor::RankOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
// The op reads the tensor's metadata but not its contents.
return false;
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return {};
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto rankOp = cast<tensor::RankOp>(op);
FailureOr<Value> v = getBuffer(rewriter, rankOp.getTensor(), options);
if (failed(v))
return failure();
replaceOpWithNewBufferizedOp<memref::RankOp>(rewriter, op, rankOp.getType(),
*v);
return success();
}
};
/// Bufferization of tensor.reshape. Replace with memref.reshape.
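///
/// Illustrative sketch (placeholder SSA names; if the source buffer does not
/// have an identity layout, it is first copied into one that does):
/// ```
/// %0 = tensor.reshape %t(%shape)
///     : (tensor<4xf32>, tensor<2xindex>) -> tensor<2x2xf32>
/// ```
/// bufferizes to:
/// ```
/// %0 = memref.reshape %t_buf(%shape_buf)
///     : (memref<4xf32>, memref<2xindex>) -> memref<2x2xf32>
/// ```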
struct ReshapeOpInterface
: public BufferizableOpInterface::ExternalModel<ReshapeOpInterface,
tensor::ReshapeOp> {
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
// Depending on the layout map, the source buffer may have to be copied.
auto reshapeOp = cast<tensor::ReshapeOp>(op);
return opOperand == reshapeOp.getShapeMutable();
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return false;
}
AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return {{op->getOpResult(0), BufferRelation::Equivalent}};
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
auto reshapeOp = cast<tensor::ReshapeOp>(op);
FailureOr<Value> srcBuffer =
getBuffer(rewriter, reshapeOp.getSource(), options);
FailureOr<Value> shapeBuffer =
getBuffer(rewriter, reshapeOp.getShape(), options);
if (failed(srcBuffer) || failed(shapeBuffer))
return failure();
auto maybeResultMemRefType =
bufferization::getBufferType(reshapeOp.getResult(), options);
if (failed(maybeResultMemRefType))
return failure();
// memref.reshape requires the source buffer to have an identity layout.
// If the source memref does not have an identity layout, copy the source
// into a new buffer with an identity layout.
auto srcType = llvm::dyn_cast<MemRefType>(srcBuffer->getType());
if (srcType && !srcType.getLayout().isIdentity()) {
FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
rewriter, op->getLoc(), reshapeOp.getSource(), options);
if (failed(tensorAlloc))
return failure();
auto memrefType = MemRefType::get(
srcType.getShape(), srcType.getElementType(), AffineMap(),
cast<BaseMemRefType>(srcBuffer->getType()).getMemorySpace());
srcBuffer = rewriter
.create<bufferization::ToMemrefOp>(
op->getLoc(), memrefType, *tensorAlloc)
.getResult();
}
replaceOpWithNewBufferizedOp<memref::ReshapeOp>(
rewriter, op, maybeResultMemRefType.value(), *srcBuffer, *shapeBuffer);
return success();
}
FailureOr<BaseMemRefType>
getBufferType(Operation *op, Value value, const BufferizationOptions &options,
SmallVector<Value> &invocationStack) const {
auto reshapeOp = cast<tensor::ReshapeOp>(op);
assert(value == reshapeOp.getResult() && "unexpected value provided");
auto maybeSourceBufferType = bufferization::getBufferType(
reshapeOp.getSource(), options, invocationStack);
if (failed(maybeSourceBufferType))
return failure();
return getMemRefTypeWithStaticIdentityLayout(
reshapeOp.getResult().getType(),
cast<BaseMemRefType>(maybeSourceBufferType.value()).getMemorySpace());
}
};
/// Analysis of ParallelInsertSliceOp.
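///
/// Sketch of the lowering: the op bufferizes to a memref.subview of the
/// (shared) destination buffer plus a memcpy of the source buffer into that
/// subview. Both are emitted right before the parallel combining terminator
/// (e.g. the in_parallel region of scf.forall), and the op itself is erased.
/// If everything bufferizes in-place, the memcpy folds away.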
struct ParallelInsertSliceOpInterface
: public BufferizableOpInterface::ExternalModel<
ParallelInsertSliceOpInterface, ParallelInsertSliceOp> {
AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return {};
}
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
return true;
}
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
const AnalysisState &state) const {
auto parallelInsertSliceOp = cast<ParallelInsertSliceOp>(op);
return opOperand == parallelInsertSliceOp.getDestMutable();
}
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
OpBuilder::InsertionGuard g(rewriter);
auto parallelInsertSliceOp = cast<ParallelInsertSliceOp>(op);
ParallelCombiningOpInterface parallelCombiningParent =
parallelInsertSliceOp.getParallelCombiningParent();
// Bufferize the op outside of the parallel combining terminator.
rewriter.setInsertionPoint(parallelCombiningParent);
// Get source and destination buffers.
FailureOr<Value> destBuffer =
getBuffer(rewriter, parallelInsertSliceOp.getDest(), options);
if (failed(destBuffer))
return failure();
FailureOr<Value> srcBuffer =
getBuffer(rewriter, parallelInsertSliceOp.getSource(), options);
if (failed(srcBuffer))
return failure();
// Take a subview of the destination buffer.
auto destBufferType = cast<MemRefType>(destBuffer->getType());
auto subviewMemRefType =
cast<MemRefType>(memref::SubViewOp::inferRankReducedResultType(
parallelInsertSliceOp.getSourceType().getShape(), destBufferType,
parallelInsertSliceOp.getMixedOffsets(),
parallelInsertSliceOp.getMixedSizes(),
parallelInsertSliceOp.getMixedStrides()));
Value subview = rewriter.create<memref::SubViewOp>(
parallelInsertSliceOp.getLoc(), subviewMemRefType, *destBuffer,
parallelInsertSliceOp.getMixedOffsets(),
parallelInsertSliceOp.getMixedSizes(),
parallelInsertSliceOp.getMixedStrides());
// This memcpy will fold away if everything bufferizes in-place.
if (failed(options.createMemCpy(rewriter, parallelInsertSliceOp.getLoc(),
*srcBuffer, subview)))
return failure();
// In case the source was allocated in the same block, make sure that the
// deallocation op (if any) appears after the memcpy. By default, deallocs
// are placed before the terminator, but this does not work for ForallOp
// because the terminator does more than just yielding a value.
//
// Note: This is not a problem for the destination buffer because these are
// assumed to always bufferize in-place.
for (Operation *user : srcBuffer->getUsers()) {
if (hasEffect<MemoryEffects::Free>(user)) {
if (user->getBlock() == parallelCombiningParent->getBlock())
rewriter.moveOpBefore(user, user->getBlock()->getTerminator());
break;
}
}
// Delete the op.
rewriter.eraseOp(op);
return success();
}
};
/// Bufferization of tensor.splat. Bufferizes to a new allocation that is filled
/// with a linalg.map. Similar to tensor.generate.
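///
/// Illustrative sketch (placeholder SSA names; same linalg.map notation as in
/// the lowerGenerateLikeOpBody example above):
/// ```
/// %0 = tensor.splat %f : tensor<4xf32>
/// ```
/// bufferizes to roughly:
/// ```
/// %alloc = bufferization.alloc_tensor() : tensor<4xf32>
/// %0 = linalg.map ins() outs(%alloc) {
///   linalg.yield %f : f32
/// }
/// ```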
struct SplatOpInterface
: public BufferizableOpInterface::ExternalModel<SplatOpInterface,
tensor::SplatOp> {
bool bufferizesToAllocation(Operation *op, Value value) const { return true; }
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
const BufferizationOptions &options) const {
OpBuilder::InsertionGuard g(rewriter);
auto splatOp = cast<tensor::SplatOp>(op);
// Allocate memory.
Location loc = op->getLoc();
FailureOr<Value> tensorAlloc = allocateTensorForShapedValue(
rewriter, loc, splatOp.getResult(), options,
/*copy=*/false);
if (failed(tensorAlloc))
return failure();
// Create linalg::MapOp.
auto tensorType = cast<RankedTensorType>(tensorAlloc->getType());
// TODO: Implement memory space for this op.
if (options.defaultMemorySpaceFn(tensorType) != Attribute())
return op->emitError("memory space not implemented yet");
auto linalgOp =
rewriter.create<linalg::MapOp>(loc, tensorType, /*inputs=*/ValueRange(),
/*init=*/*tensorAlloc);
Block &linalgBody = linalgOp.getMapper().emplaceBlock();
// Create linalg::IndexOps.
rewriter.setInsertionPointToStart(&linalgBody);
rewriter.create<linalg::YieldOp>(loc, splatOp.getInput());
rewriter.replaceOp(splatOp, linalgOp.getResult()[0]);
return success();
}
};
} // namespace
} // namespace tensor
} // namespace mlir
void mlir::tensor::registerBufferizableOpInterfaceExternalModels(
DialectRegistry &registry) {
registry.addExtension(+[](MLIRContext *ctx, tensor::TensorDialect *dialect) {
CastOp::attachInterface<CastOpInterface>(*ctx);
CollapseShapeOp::attachInterface<CollapseShapeOpInterface>(*ctx);
DimOp::attachInterface<DimOpInterface>(*ctx);
EmptyOp::attachInterface<EmptyOpInterface>(*ctx);
ExpandShapeOp::attachInterface<ExpandShapeOpInterface>(*ctx);
ExtractSliceOp::attachInterface<ExtractSliceOpInterface>(*ctx);
ExtractOp::attachInterface<ExtractOpInterface>(*ctx);
FromElementsOp::attachInterface<FromElementsOpInterface>(*ctx);
GenerateOp::attachInterface<GenerateOpInterface>(*ctx);
InsertOp::attachInterface<InsertOpInterface>(*ctx);
InsertSliceOp::attachInterface<InsertSliceOpInterface>(*ctx);
PadOp::attachInterface<PadOpInterface>(*ctx);
ParallelInsertSliceOp::attachInterface<ParallelInsertSliceOpInterface>(
*ctx);
RankOp::attachInterface<RankOpInterface>(*ctx);
ReshapeOp::attachInterface<ReshapeOpInterface>(*ctx);
SplatOp::attachInterface<SplatOpInterface>(*ctx);
// Load additional dialects whose ops may be created during bufferization.
ctx->loadDialect<arith::ArithDialect, linalg::LinalgDialect>();
});
// Bufferization requires SubsetInsertionOpInterface models. Make sure that
// they are registered.
tensor::registerSubsetOpInterfaceExternalModels(registry);
}