//===- Transforms.h - Linalg transformations as patterns --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H_
#define DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H_
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/IR/Identifier.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/Bufferize.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
namespace mlir {
class BufferizeTypeConverter;
class FrozenRewritePatternList;
namespace linalg {
struct LinalgFusionOptions;
struct LinalgTilingOptions;
//===----------------------------------------------------------------------===//
// Transformations exposed as function calls.
//===----------------------------------------------------------------------===//
using LinalgLoops = SmallVector<Operation *, 4>;
struct TiledLinalgOp {
LinalgOp op;
SmallVector<Operation *, 8> loops;
SmallVector<Value, 4> tensorResults;
};
/// Populates patterns for vectorization of all ConvN-D ops.
void populateConvVectorizationPatterns(
MLIRContext *context, SmallVectorImpl<OwningRewritePatternList> &patterns,
ArrayRef<int64_t> tileSizes);
/// Populates the given list with patterns to bufferize linalg ops.
void populateLinalgBufferizePatterns(MLIRContext *context,
BufferizeTypeConverter &converter,
OwningRewritePatternList &patterns);
/// Performs standalone tiling of a single LinalgOp by `tileSizes` and permutes
/// the loop nest according to `interchangeVector`.
/// The permutation is expressed as a list of integers that specify
/// the new ordering of the loop nest. The length of `interchangeVector`
/// must equal the length of `tileSizes`.
/// An empty vector is interpreted as the identity permutation and the
/// transformation returns early.
///
/// Returns a struct containing the tiled loops in the specified order
/// and the cloned op if successful, llvm::None otherwise.
///
/// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed by
/// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be
/// integers, in the range 0..`tileSizes.size()` without duplications
/// (i.e. `[1,1,2]` is an invalid permutation).
Optional<TiledLinalgOp> tileLinalgOp(OpBuilder &b, LinalgOp op,
const LinalgTilingOptions &options);
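// Example usage (a minimal sketch; `builder` and `linalgOp` are assumed to
// exist, and the tile sizes/interchange are illustrative):
//   LinalgTilingOptions options;
//   options.setTileSizes({16, 32, 64}).setInterchange({1, 2, 0});
//   Optional<TiledLinalgOp> tiled = tileLinalgOp(builder, linalgOp, options);
//   if (!tiled)
//     return failure();
//   LinalgOp tiledOp = tiled->op; // The cloned, tiled operation.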
/// Fuse a sequence of linalg operations (`ops`) using tile-and-fuse. This
/// proceeds as follows:
/// - Find outer parallel loops in these ops that can be fused.
/// - Tile fusable outer parallel loops of the last operation in the sequence.
/// - Fuse the remaining operations with the tiled operation
///
/// For example, consider the sequence of matmul below
///
/// linalg.matmul ins(%arg0, %arg1 : memref<256x32xf32>, memref<32x32xf32>)
/// outs(%arg2 : memref<256x32xf32>)
/// linalg.matmul ins(%arg2, %arg3 : memref<256x32xf32>, memref<32x32xf32>)
/// outs(%arg4 : memref<256x32xf32>)
///
/// It is legal to fuse the RAW dependence (through %arg2) by only fusing the
/// matmuls row-wise. For example, the fused computation for the above is shown
/// below. The outer `scf.parallel` loop is the "fused" loop obtained by tiling
/// along the rows of the matrix. The entire rows of the first matmul operation
/// need to be computed before they can be used for the second matmul. The
/// second matmul is further tiled (similar to normal tiling).
///
/// #map0 = affine_map<(d0, d1)[s0] -> (d0 * 32 + s0 + d1)>
/// #map1 = affine_map<(d0, d1) -> (d0 * 32 + d1)>
/// scf.parallel (%arg5) = (%c0) to (%c256) step (%c16) {
/// %0 = subview %arg2[%arg5, 0] [16, 32] [1, 1]
/// : memref<256x32xf32> to memref<16x32xf32, #map0>
/// %1 = subview %arg4[%arg5, 0] [16, 32] [1, 1]
/// : memref<256x32xf32> to memref<16x32xf32, #map0>
/// %2 = subview %arg0[%arg5, 0] [16, 32] [1, 1]
/// : memref<256x32xf32> to memref<16x32xf32, #map0>
/// %3 = subview %arg1[0, 0] [32, 32] [1, 1]
/// : memref<32x32xf32> to memref<32x32xf32, #map1>
/// %4 = subview %arg3[0, 0] [32, 32] [1, 1]
/// : memref<32x32xf32> to memref<32x32xf32, #map1>
/// linalg.matmul
/// ins(%2, %3 : memref<16x32xf32, #map0>, memref<32x32xf32, #map1>)
/// outs(%0 : memref<16x32xf32, #map0>)
/// linalg.matmul
/// ins(%0, %4 : memref<16x32xf32, #map0>, memref<32x32xf32, #map1>)
/// outs(%1 : memref<16x32xf32, #map0>)
/// }
///
/// `tilingOptions` is used to tile the last operation in the sequence; based
/// on how tile+fuse is implemented, the fused loops are generated from that
/// operation. In particular, the tile sizes for the fused loops are obtained
/// from it. The following tiling options are handled differently in tile+fuse
/// (compared to tile only):
/// - Interchange of the tiling loops is not supported right now.
/// - Only the fused loops are distributed.
struct TiledAndFusedLinalgOps {
/// Operation obtained by tiling the last operation in the sequence of `ops`
/// passed to `tileAndFuseLinalgOps`.
LinalgOp op;
/// The dimensions of the loops that are fused.
std::set<unsigned> fusedLoopDims;
/// The generated fused operations (created within the fused loops).
SmallVector<LinalgOp, 1> fusedProducers;
/// The fused loops generated.
SmallVector<Operation *, 4> fusedLoops;
};
Optional<TiledAndFusedLinalgOps>
tileAndFuseLinalgOps(OpBuilder &builder, ArrayRef<LinalgOp> ops,
const LinalgDependenceGraph &dependenceGraph,
const LinalgTilingOptions &tilingOptions);
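// Example invocation (a minimal sketch; `builder`, the producer/consumer
// sequence `ops` and the precomputed `dependenceGraph` are assumptions):
//   LinalgTilingOptions tilingOptions;
//   tilingOptions.setTileSizes({16, 0}); // Tile (and fuse) along rows only.
//   Optional<TiledAndFusedLinalgOps> tiledAndFused =
//       tileAndFuseLinalgOps(builder, ops, dependenceGraph, tilingOptions);
//   if (!tiledAndFused)
//     return failure();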
/// Interchanges the `iterator_types` and `indexing_maps` dimensions of `op`.
/// This is an in-place transformation controlled by `interchangeVector`.
/// An empty vector is interpreted as the identity permutation and the
/// transformation returns early.
///
/// E.g. the permutation `(i,j,k) -> (j,k,i)` is expressed with
/// `interchangeVector = [1,2,0]`. All values in `interchangeVector` must be
/// integers, in the range 0..`op.rank` without duplications
/// (i.e. `[1,1,2]` is an invalid permutation).
LinalgOp interchange(LinalgOp op, ArrayRef<unsigned> interchangeVector);
/// Callback function type used to perform the allocation for the promoted
/// `subView`. In `boundingSubViewSize` a best attempt is made to find the
/// smallest constant value for the size of the buffer needed for each
/// dimension. If that is not possible, it contains the dynamic size of the
/// subview. The callback should return the buffer to use.
using AllocBufferCallbackFn = std::function<Optional<Value>(
OpBuilder &b, SubViewOp subView, ArrayRef<Value> boundingSubViewSize,
OperationFolder *folder)>;
/// Callback function type used to deallocate the buffers used to hold the
/// promoted subview.
using DeallocBufferCallbackFn =
std::function<LogicalResult(OpBuilder &b, Value buffer)>;
/// Callback function type used to insert a copy from the original subview to
/// the subview of the promoted region (for the read operands), or from the
/// subview of the promoted region to the original subview (for the results).
/// The copy has to happen from `src` to `dst`.
using CopyCallbackFn =
std::function<LogicalResult(OpBuilder &b, Value src, Value dst)>;
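// A minimal sketch of a custom copy callback that emits a linalg.copy, which
// mirrors the default behavior when no callback is provided:
//   CopyCallbackFn copyFn = [](OpBuilder &b, Value src, Value dst) {
//     b.create<CopyOp>(src.getLoc(), src, dst);
//     return success();
//   };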
struct LinalgPromotionOptions {
/// Indices of subViews to promote. If `None`, try to promote all operands.
Optional<DenseSet<unsigned>> operandsToPromote = None;
LinalgPromotionOptions &setOperandsToPromote(ArrayRef<int64_t> operands) {
operandsToPromote = DenseSet<unsigned>();
operandsToPromote->insert(operands.begin(), operands.end());
return *this;
}
/// If the ith element of `useFullTiles` is true, the full view should be used
/// for the promoted buffer of the ith operand in `operandsToPromote`.
/// Otherwise the partial view will be used. The decision defaults to
/// `useFullTileBuffersDefault` when `useFullTileBuffers` is `None` and for
/// operands missing from `useFullTileBuffers`.
Optional<llvm::SmallBitVector> useFullTileBuffers = None;
LinalgPromotionOptions &setUseFullTileBuffers(ArrayRef<bool> useFullTiles) {
unsigned size = useFullTiles.size();
llvm::SmallBitVector tmp(size, false);
for (unsigned i = 0; i < size; ++i)
tmp[i] = useFullTiles[i];
useFullTileBuffers = tmp;
return *this;
}
/// If true, all operands unspecified by `useFullTileBuffers` will use the full
/// view; otherwise the partial view.
bool useFullTileBuffersDefault = false;
LinalgPromotionOptions &setUseFullTileBuffersByDefault(bool use) {
useFullTileBuffersDefault = use;
return *this;
}
/// Allow the use of dynamically-sized buffers.
bool dynamicBuffers = false;
LinalgPromotionOptions &setDynamicBuffers(bool dynamic) {
dynamicBuffers = dynamic;
return *this;
}
/// Alignment of the promoted buffer. If `None`, do not specify an alignment.
Optional<unsigned> alignment = None;
LinalgPromotionOptions &setAlignment(unsigned align) {
alignment = align;
return *this;
}
/// Use alloca with the default allocation scheme.
bool useAlloca = false;
LinalgPromotionOptions &setUseAlloca(bool use) {
useAlloca = use;
return *this;
}
/// Callback function to do the allocation of the promoted buffer. If None,
/// then the default allocation scheme of allocating a memref<?xi8> buffer
/// followed by a view operation is used.
Optional<AllocBufferCallbackFn> allocationFn = None;
Optional<DeallocBufferCallbackFn> deallocationFn = None;
LinalgPromotionOptions &
setAllocationDeallocationFns(AllocBufferCallbackFn const &allocFn,
DeallocBufferCallbackFn const &deallocFn) {
allocationFn = allocFn;
deallocationFn = deallocFn;
return *this;
}
/// Callback functions to do the copy of data to and from the promoted
/// subview. If `None`, a linalg.copy is used.
Optional<CopyCallbackFn> copyInFn = None;
Optional<CopyCallbackFn> copyOutFn = None;
LinalgPromotionOptions &setCopyInOutFns(CopyCallbackFn const &copyIn,
CopyCallbackFn const &copyOut) {
copyInFn = copyIn;
copyOutFn = copyOut;
return *this;
}
};
/// Creates a new buffer using the `allocationFn` provided. The size of this
/// buffer is the smallest constant bounding size along each dimension that can
/// be computed for the size of the result of `subView`. Returns the allocated
/// buffer as `fullLocalView` and the view that matches the size of the result
/// of the subview operation as `partialLocalView`.
struct PromotionInfo {
Value fullLocalView;
Value partialLocalView;
};
Optional<PromotionInfo>
promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, SubViewOp subView,
AllocBufferCallbackFn allocationFn,
OperationFolder *folder = nullptr);
/// Promotes the `subViews` into a new buffer allocated at the insertion point
/// `b`. Promotion occurs in 3 steps:
/// 1. Create a new buffer for a full tile (i.e. not clipped at the boundary).
/// 2. Take a full view on the buffer.
/// 3. Take a partial slice of the full view in step 2 and copy into it.
/// Infers statically sized buffers from subViews unless `dynamicBuffers` is
/// true.
///
/// Returns the modified linalg op (the modification happens in place) as well
/// as all the copy ops created.
Optional<LinalgOp> promoteSubViews(OpBuilder &b, LinalgOp op,
LinalgPromotionOptions options,
OperationFolder *folder = nullptr);
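// Example usage (a minimal sketch; the operand indices and alignment are
// illustrative, `b` and `linalgOp` are assumed to exist):
//   LinalgPromotionOptions options;
//   options.setOperandsToPromote({0, 1}).setAlignment(16);
//   Optional<LinalgOp> promoted = promoteSubViews(b, linalgOp, options);
//   if (!promoted)
//     return failure();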
/// Emits a suitable vector form for a Linalg op with fully static shape.
void vectorizeLinalgOp(OpBuilder &builder, Operation *op);
/// Emits a loop nest of `LoopTy` with the proper body for `op`.
template <typename LoopTy>
Optional<LinalgLoops> linalgLowerOpToLoops(OpBuilder &builder, Operation *op);
/// Emits a loop nest of `scf.for` with the proper body for `op`.
LogicalResult linalgOpToLoops(OpBuilder &builder, Operation *op);
/// Emits a loop nest of `scf.parallel` with the proper body for `op`.
LogicalResult linalgOpToParallelLoops(OpBuilder &builder, Operation *op);
/// Emits a loop nest of `affine.for` with the proper body for `op`.
LogicalResult linalgOpToAffineLoops(OpBuilder &builder, Operation *op);
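// A minimal lowering sketch (assuming `op` is a LinalgOp operating on
// buffers). The original op must be erased once the loop nest is emitted:
//   OpBuilder builder(op);
//   if (failed(linalgOpToParallelLoops(builder, op)))
//     return failure();
//   op->erase();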
//===----------------------------------------------------------------------===//
// Preconditions that ensure the corresponding transformation succeeds and can
// be applied as a rewrite pattern.
//===----------------------------------------------------------------------===//
/// Emits a `generic` or `indexed_generic` operation with the `indexing_maps`
/// and `iterator_types` permuted according to `permutation`.
LogicalResult
interchangeGenericLinalgOpPrecondition(Operation *op,
ArrayRef<unsigned> interchangeVector);
/// Promote std.subviews feeding linalg operations.
LogicalResult promoteSubviewsPrecondition(Operation *op,
LinalgPromotionOptions options);
/// Rewrite a linalg.generic into a suitable vector.contract op.
LogicalResult vectorizeLinalgOpPrecondition(Operation *op);
//===----------------------------------------------------------------------===//
// Transformations exposed as rewrite patterns.
//===----------------------------------------------------------------------===//
// Marker used as attribute name in generated Linalg rewriting transformations.
struct LinalgTransforms {
static const StringLiteral kLinalgTransformMarker;
};
/// Helper class to control common attribute matching and setting behavior.
struct LinalgMarker {
explicit LinalgMarker(ArrayRef<Identifier> matchDisjunction = {},
Optional<Identifier> replacement = None);
LinalgMarker(LinalgMarker &&) = default;
LinalgMarker(const LinalgMarker &) = default;
LogicalResult checkAndNotify(PatternRewriter &rewriter, Operation *op) const;
void replaceLinalgMarker(PatternRewriter &rewriter, Operation *op) const;
private:
SmallVector<Identifier, 4> matchDisjunction;
Optional<Identifier> replacement;
};
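// Example markers (a sketch; the "tiled" marker name is illustrative and
// `ctx` is assumed to be an MLIRContext pointer):
//   // Match ops carrying no marker; tag the rewritten op with "tiled".
//   LinalgMarker startMarker({}, Identifier::get("tiled", ctx));
//   // Match only ops already tagged "tiled"; do not set a new marker.
//   LinalgMarker tiledOnly({Identifier::get("tiled", ctx)}, llvm::None);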
///
/// Linalg tiling patterns.
///
/// Apply the `tileLinalgOp` transformation as a pattern.
/// `marker` controls LinalgTransformMarker matching and update when specified.
/// See `tileLinalgOp` for more details.
enum class LinalgTilingLoopType {
Loops = 0,
AffineLoops = 1,
ParallelLoops = 2,
};
using TileSizeComputationFunction =
std::function<SmallVector<Value, 4>(OpBuilder &, Operation *)>;
struct LinalgTilingOptions {
/// Computation function that returns the tile sizes for each operation.
/// Construction of constant tile sizes should be delayed in order to
/// interoperate with folding.
TileSizeComputationFunction tileSizeComputationFunction = nullptr;
LinalgTilingOptions &
setTileSizeComputationFunction(TileSizeComputationFunction fun) {
tileSizeComputationFunction = std::move(fun);
return *this;
}
/// Set the `tileSizeComputationFunction` to return the values `ts`. The
/// values must not fold away when tiling. Otherwise, use a more robust
/// `tileSizeComputationFunction`.
LinalgTilingOptions &setTileSizes(SmallVector<Value, 4> ts) {
tileSizeComputationFunction = [=](OpBuilder &, Operation *) { return ts; };
return *this;
}
/// Convenience function to set the `tileSizeComputationFunction` to a
/// function that computes tile sizes at the point they are needed. Allows
/// proper interaction with folding.
LinalgTilingOptions &setTileSizes(ArrayRef<int64_t> ts);
/// The interchange vector to reorder the tiled loops.
SmallVector<unsigned, 4> interchangeVector = {};
LinalgTilingOptions &setInterchange(ArrayRef<unsigned> interchange) {
interchangeVector.assign(interchange.begin(), interchange.end());
return *this;
}
/// The type of tile loops to generate.
LinalgTilingLoopType loopType = LinalgTilingLoopType::Loops;
LinalgTilingOptions &setLoopType(LinalgTilingLoopType lt) {
loopType = lt;
return *this;
}
/// When specified, distributes the generated tile loops to processors.
Optional<LinalgLoopDistributionOptions> distribution = None;
LinalgTilingOptions &
setDistributionOptions(LinalgLoopDistributionOptions distributionOptions) {
distribution = std::move(distributionOptions);
return *this;
}
};
/// Canonicalization patterns relevant to apply after tiling patterns. These are
/// applied automatically by the tiling pass but need to be applied manually
/// when tiling is called programmatically.
OwningRewritePatternList
getLinalgTilingCanonicalizationPatterns(MLIRContext *ctx);
struct LinalgBaseTilingPattern : public RewritePattern {
LinalgBaseTilingPattern(StringRef opName, MLIRContext *context,
LinalgTilingOptions options,
LinalgMarker marker = LinalgMarker(),
PatternBenefit benefit = 1);
LogicalResult
matchAndRewriteBase(Operation *op, PatternRewriter &rewriter,
SmallVectorImpl<Value> &tensorResults) const;
private:
/// LinalgTransformMarker handles special attribute manipulations.
LinalgMarker marker;
/// Options to control tiling.
LinalgTilingOptions options;
};
template <typename OpTy>
struct LinalgTilingPattern : public LinalgBaseTilingPattern {
LinalgTilingPattern(MLIRContext *context, LinalgTilingOptions options,
LinalgMarker marker = LinalgMarker(),
PatternBenefit benefit = 1)
: LinalgBaseTilingPattern(OpTy::getOperationName(), context, options,
marker, benefit) {}
LogicalResult matchAndRewrite(Operation *op,
PatternRewriter &rewriter) const override {
SmallVector<Value, 4> tensorResults;
if (failed(LinalgBaseTilingPattern::matchAndRewriteBase(op, rewriter,
tensorResults)))
return failure();
if (tensorResults.empty())
rewriter.eraseOp(op);
else
rewriter.replaceOp(op, tensorResults);
return success();
}
};
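// Example population (a sketch; the op, tile sizes and marker name are
// illustrative):
//   OwningRewritePatternList patterns;
//   patterns.insert<LinalgTilingPattern<MatmulOp>>(
//       ctx, LinalgTilingOptions().setTileSizes({8, 16, 4}),
//       LinalgMarker({}, Identifier::get("tiled", ctx)));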
struct LinalgFusionOptions {
/// List of operand indices to use for fusion.
llvm::SmallSet<unsigned, 1> indicesToFuse = {};
LinalgFusionOptions &setIndicesToFuse(ArrayRef<int64_t> operands) {
indicesToFuse.insert(operands.begin(), operands.end());
return *this;
}
};
struct LinalgBaseTileAndFusePattern : public RewritePattern {
LinalgBaseTileAndFusePattern(StringRef opName, MLIRContext *context,
const LinalgDependenceGraph &dependenceGraph,
LinalgTilingOptions tilingOptions,
LinalgFusionOptions fusionOptions,
LinalgMarker marker = LinalgMarker(),
LinalgMarker fusedOpMarker = LinalgMarker(),
LinalgMarker originalOpMarker = LinalgMarker(),
PatternBenefit benefit = 1);
LogicalResult matchAndRewrite(Operation *op,
PatternRewriter &rewriter) const override;
private:
/// Dependence graph needed for fusion.
const LinalgDependenceGraph &dependenceGraph;
/// Options to control tiling.
LinalgTilingOptions tilingOptions;
/// Options to control fusion.
LinalgFusionOptions fusionOptions;
/// Marker to control application of the pattern.
LinalgMarker marker;
/// Marker set on the fused op after tile and fuse.
LinalgMarker fusedOpMarker;
/// The dependence graph is not modifiable, i.e. if the Linalg operations used
/// to build it change, the dependence graph needs to be recomputed. To avoid
/// invalidating the graph while the transformation happens, the original
/// producer can be tagged with a marker that can later be used to delete the
/// original operations.
LinalgMarker originalOpMarker;
};
template <typename OpTy>
struct LinalgTileAndFusePattern : public LinalgBaseTileAndFusePattern {
LinalgTileAndFusePattern(MLIRContext *context,
const LinalgDependenceGraph &dependenceGraph,
LinalgTilingOptions tilingOptions,
LinalgFusionOptions fusionOptions,
LinalgMarker marker = LinalgMarker(),
LinalgMarker fusedOpMarker = LinalgMarker(),
LinalgMarker originalOpMarker = LinalgMarker(),
PatternBenefit benefit = 1)
: LinalgBaseTileAndFusePattern(
OpTy::getOperationName(), context, dependenceGraph, tilingOptions,
fusionOptions, marker, fusedOpMarker, originalOpMarker, benefit) {}
};
///
/// Linalg interchange patterns.
///
/// Apply the `interchange` transformation as a pattern.
/// `marker` controls LinalgTransformMarker matching and update when specified.
/// See `interchange` for more details.
struct LinalgBaseInterchangePattern : public RewritePattern {
LinalgBaseInterchangePattern(StringRef opName, MLIRContext *context,
ArrayRef<unsigned> interchangeVector,
LinalgMarker marker = LinalgMarker(),
PatternBenefit benefit = 1);
LogicalResult matchAndRewrite(Operation *op,
PatternRewriter &rewriter) const override;
private:
/// LinalgTransformMarker handles special attribute manipulations.
LinalgMarker marker;
/// The interchange vector to reorder the iterators and indexing_maps dims.
SmallVector<unsigned, 8> interchangeVector;
};
template <typename OpTy>
struct LinalgInterchangePattern : public LinalgBaseInterchangePattern {
LinalgInterchangePattern(MLIRContext *context,
ArrayRef<unsigned> interchangeVector,
LinalgMarker marker = LinalgMarker(),
PatternBenefit benefit = 1)
: LinalgBaseInterchangePattern(OpTy::getOperationName(), context,
interchangeVector, marker, benefit) {}
};
///
/// Linalg promotion patterns.
///
/// Apply the `promoteSubViews` transformation as a pattern.
/// `marker` controls LinalgTransformMarker matching and update when specified.
/// See `promoteSubViews` for more details.
struct LinalgBasePromotionPattern : public RewritePattern {
LinalgBasePromotionPattern(StringRef opName, MLIRContext *context,
LinalgPromotionOptions options,
LinalgMarker marker = LinalgMarker(),
PatternBenefit benefit = 1);
LogicalResult matchAndRewrite(Operation *op,
PatternRewriter &rewriter) const override;
private:
/// LinalgTransformMarker handles special attribute manipulations.
LinalgMarker marker;
/// Promotion options.
LinalgPromotionOptions options;
};
template <typename OpTy>
struct LinalgPromotionPattern : public LinalgBasePromotionPattern {
LinalgPromotionPattern(MLIRContext *context, LinalgPromotionOptions options,
LinalgMarker marker = LinalgMarker(),
PatternBenefit benefit = 1)
: LinalgBasePromotionPattern(OpTy::getOperationName(), context, options,
marker, benefit) {}
};
///
/// Linalg vectorization patterns.
///
/// Apply the `vectorizeLinalgOp` transformation as a pattern.
/// `marker` controls LinalgTransformMarker matching and update when specified.
/// See `vectorizeLinalgOp` for more details.
struct LinalgBaseVectorizationPattern : public RewritePattern {
LinalgBaseVectorizationPattern(StringRef opName, MLIRContext *context,
LinalgMarker marker = LinalgMarker(),
PatternBenefit benefit = 1);
LogicalResult matchAndRewrite(Operation *op,
PatternRewriter &rewriter) const override;
private:
/// LinalgTransformMarker handles special attribute manipulations.
LinalgMarker marker;
};
template <typename OpTy>
struct LinalgVectorizationPattern : public LinalgBaseVectorizationPattern {
LinalgVectorizationPattern(MLIRContext *context,
LinalgMarker marker = LinalgMarker(),
PatternBenefit benefit = 1)
: LinalgBaseVectorizationPattern(OpTy::getOperationName(), context,
marker, benefit) {}
};
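// Example (a sketch): vectorize only ops that a previous tiling pattern has
// tagged "tiled", without setting a new marker (the name is illustrative):
//   patterns.insert<LinalgVectorizationPattern<MatmulOp>>(
//       ctx, LinalgMarker({Identifier::get("tiled", ctx)}, llvm::None));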
///
/// Linalg lowering patterns.
///
/// Apply the `linalgLowerOpToLoops` transformation as a pattern.
/// `marker` controls LinalgTransformMarker matching and update when specified.
/// See `linalgLowerOpToLoops` for more details.
enum class LinalgLoweringType {
LibraryCall = 0,
Loops = 1,
AffineLoops = 2,
ParallelLoops = 3
};
template <typename OpTy>
struct LinalgLoweringPattern : public RewritePattern {
LinalgLoweringPattern(MLIRContext *context, LinalgLoweringType loweringType,
LinalgMarker marker = LinalgMarker(),
PatternBenefit benefit = 1)
: RewritePattern(OpTy::getOperationName(), {}, benefit, context),
marker(marker), loweringType(loweringType) {}
// TODO: Move implementation to .cpp once named ops are auto-generated.
LogicalResult matchAndRewrite(Operation *op,
PatternRewriter &rewriter) const override {
LinalgOp linalgOp = dyn_cast<LinalgOp>(op);
if (!linalgOp)
return failure();
if (failed(marker.checkAndNotify(rewriter, linalgOp)))
return failure();
if (loweringType == LinalgLoweringType::LibraryCall) {
// TODO: Move lowering to library calls here.
return failure();
} else if (loweringType == LinalgLoweringType::Loops) {
if (failed(linalgOpToLoops(rewriter, op)))
return failure();
} else if (loweringType == LinalgLoweringType::AffineLoops) {
if (failed(linalgOpToAffineLoops(rewriter, op)))
return failure();
} else if (failed(linalgOpToParallelLoops(rewriter, op))) {
return failure();
}
rewriter.eraseOp(op);
return success();
}
private:
/// LinalgTransformMarker handles special attribute manipulations.
LinalgMarker marker;
/// Controls whether the pattern lowers to library calls, scf.for, affine.for
/// or scf.parallel.
LinalgLoweringType loweringType;
};
///
/// Linalg generalization patterns.
///
/// Populates `patterns` with patterns to convert spec-generated named ops to
/// linalg.generic ops.
void populateLinalgNamedOpsGeneralizationPatterns(
MLIRContext *context, OwningRewritePatternList &patterns,
LinalgMarker marker = LinalgMarker());
/// Populates `patterns` with patterns to convert linalg.conv ops to
/// linalg.generic ops.
void populateLinalgConvGeneralizationPatterns(
MLIRContext *context, OwningRewritePatternList &patterns,
LinalgMarker marker = LinalgMarker());
//===----------------------------------------------------------------------===//
// Op-specific patterns.
//===----------------------------------------------------------------------===//
/// Match and rewrite for the pattern:
/// ```
/// %alloc = ...
/// [optional] %view = std.view %alloc ...
/// %subView = subview %allocOrView ...
/// [optional] linalg.fill(%allocOrView, %cst) ...
/// ...
/// linalg.copy(%in, %subView) ...
/// vector.transfer_read %allocOrView[...], %cst ...
/// ```
/// into
/// ```
/// [unchanged] %alloc = ...
/// [unchanged] [optional] %view = std.view %alloc ...
/// [unchanged] %subView = subview %allocOrView ...
/// ...
/// vector.transfer_read %in[...], %cst ...
/// ```
/// Where there is no interleaved use between linalg.copy and transfer_read as
/// well as no interleaved use between linalg.fill and linalg.copy (if
/// linalg.fill is specified).
/// This is a custom rewrite to forward partial reads (with optional fills) to
/// vector.transfer_read.
struct LinalgCopyVTRForwardingPattern
: public OpRewritePattern<vector::TransferReadOp> {
using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
LogicalResult matchAndRewrite(vector::TransferReadOp xferOp,
PatternRewriter &rewriter) const override;
};
/// Match and rewrite for the pattern:
/// ```
/// %alloc = ...
/// [optional] %view = std.view %alloc ...
/// %subView = subview %allocOrView...
/// ...
/// vector.transfer_write %..., %allocOrView[...]
/// linalg.copy(%subView, %out)
/// ```
/// into
/// ```
/// [unchanged] %alloc = ...
/// [unchanged] [optional] %view = std.view %alloc ...
/// [unchanged] %subView = subview %allocOrView...
/// ...
/// vector.transfer_write %..., %out[...]
/// ```
/// Where there is no interleaved use between transfer_write and linalg.copy.
/// This is a custom rewrite to forward partial writes to vector.transfer_write.
struct LinalgCopyVTWForwardingPattern
: public OpRewritePattern<vector::TransferWriteOp> {
using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;
LogicalResult matchAndRewrite(vector::TransferWriteOp xferOp,
PatternRewriter &rewriter) const override;
};
/// Canonicalize AffineMinOp operations in the context of enclosing scf.for and
/// scf.parallel by:
/// 1. building an affine map where uses of the induction variable of a loop
/// are replaced by either the min (i.e. `%lb`) or the max
/// (i.e. `%lb + %step * floordiv(%ub - 1 - %lb, %step)`) expression, depending
/// on whether the induction variable is used with a positive or negative
/// coefficient.
/// 2. checking whether any of the results of this affine map is known to be
/// greater than all other results.
/// 3. replacing the AffineMinOp by the result of (2).
// TODO: move to a more appropriate place when it is determined. For now Linalg
// depends both on Affine and SCF but they do not depend on each other.
struct AffineMinSCFCanonicalizationPattern
: public OpRewritePattern<AffineMinOp> {
using OpRewritePattern<AffineMinOp>::OpRewritePattern;
LogicalResult matchAndRewrite(AffineMinOp minOp,
PatternRewriter &rewriter) const override;
};
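// Example (a sketch): given
//   scf.for %i = %c0 to %c12 step %c3 {
//     %0 = affine.min affine_map<(d0) -> (3, 12 - d0)>(%i)
//   }
// the max value of %i is 0 + 3 * floordiv(12 - 1 - 0, 3) = 9, so `12 - d0` is
// at least 3 on every iteration and the affine.min folds to the constant 3.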
/// Converts a convolution op into a vector contraction.
///
/// The conversion expects the ConvOp to have dimensions of size 1 wherever the
/// *mask* is false. This ensures that the ConvOp can be lowered to a vector
/// contraction over the dimensions marked true in the *mask*.
///
/// A good example for vectorization is ConvNHWCOp, which is a 2-D convolution
/// op with channels as the last dimension. Let's vectorize its last 3
/// dimensions.
/// The initial op definition looks like this:
/// ```
/// linalg.conv_2d_nhwc %arg0, %arg1, %arg2 :
/// (memref<1x3x3x3xf32>, memref<1x3x3x3xf32>, memref<?x?x?x?xf32>)
/// ```
/// This op can be expressed as a dot product between %arg0 (input) and
/// %arg1 (kernel), which is written into the first entry of %arg2 (output).
/// This is the ConvOp this pass expects and converts into:
/// ```
/// #map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
/// #map1 = affine_map<(d0, d1, d2) -> ()>
/// .....
/// %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %c0_f32
/// : memref<1x3x3x3xf32>, vector<3x3x3xf32>
/// %1 = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %c0_f32
/// : memref<1x3x3x3xf32>, vector<3x3x3xf32>
/// %2 = vector.contract {indexing_maps = [#map0, #map0, #map1],
/// iterator_types = ["reduction", "reduction", "reduction"]} %0, %1,
/// %c0_f32 : vector<3x3x3xf32>, vector<3x3x3xf32> into f32
/// store %2, %arg2[%c0, %c0, %c0, %c0] : memref<?x?x?x?xf32>
/// ```
/// where the first two operations read the input and kernel memory buffers
/// into vectors. Subsequently, they are contracted together and the result is
/// written to the first entry of the output buffer.
template <typename ConvOp, int N>
class ConvOpVectorization : public OpRewritePattern<ConvOp> {
using OpRewritePattern<ConvOp>::OpRewritePattern;
SmallVector<bool, 4> mask;
public:
ConvOpVectorization(MLIRContext *context, SmallVector<bool, 4> msk)
: OpRewritePattern<ConvOp>(context) {
assert(msk.size() == N && "Mask size does not match rank");
this->mask = msk;
}
LogicalResult matchAndRewrite(ConvOp op,
PatternRewriter &rewriter) const override;
};
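// Example population (a sketch; the op, rank and mask are illustrative and
// assume a unit batch dimension so that the last 3 dimensions can be
// vectorized):
//   patterns.insert<ConvOpVectorization<ConvNHWCOp, 4>>(
//       ctx, SmallVector<bool, 4>{false, true, true, true});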
//===----------------------------------------------------------------------===//
// Support for staged pattern application.
//===----------------------------------------------------------------------===//
/// Helper function to allow applying rewrite patterns, interleaved with more
/// global transformations, in a staged fashion:
/// 1. the first stage consists of a list of FrozenRewritePatternList. Each
/// FrozenRewritePatternList in this list is applied once, in order.
/// 2. the second stage consists of a single FrozenRewritePatternList that is
/// applied greedily until convergence.
/// 3. the third stage consists of applying a lambda, generally used for
/// non-local transformation effects. This allows creating custom fused
/// transformations where patterns can be ordered and applied at a finer
/// granularity than a sequence of traditional compiler passes.
LogicalResult applyStagedPatterns(
Operation *op, ArrayRef<FrozenRewritePatternList> stage1Patterns,
const FrozenRewritePatternList &stage2Patterns,
function_ref<LogicalResult(Operation *)> stage3Lambda = nullptr);
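// Example usage (a minimal sketch; the stage-1 pattern lists and `funcOp` are
// assumptions):
//   SmallVector<FrozenRewritePatternList, 4> stage1Patterns;
//   stage1Patterns.emplace_back(std::move(tilingPatterns));
//   stage1Patterns.emplace_back(std::move(promotionPatterns));
//   FrozenRewritePatternList stage2Patterns(
//       getLinalgTilingCanonicalizationPatterns(ctx));
//   (void)applyStagedPatterns(funcOp, stage1Patterns, stage2Patterns,
//                             [](Operation *op) { return success(); });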
//===----------------------------------------------------------------------===//
// Support for sparse tensor code generation.
//
// The sparse compiler part of MLIR lowers a tensor expression formulated as a
// Linalg operation into a sequence of loops depending on what dimensions of the
// tensors are marked dense or sparse. The generated code distinguishes between:
// (1) for-loops that iterate over a single dense dimension,
// (2) for-loops that iterate over a single sparse dimension,
// (3) while-loops that co-iterate over several sparse dimensions.
// The for-loops may be subsequently optimized for parallel or vector execution.
//
// For more details, see the Dialect/Linalg/Transforms/Sparsification.cpp file.
//===----------------------------------------------------------------------===//
/// Defines a parallelization strategy. Any implicit loop in the Linalg
/// operation that is marked "parallel" (thus not "reduction") is a candidate
/// for parallelization. The loop is made parallel if (1) allowed by the
/// strategy (e.g., AnyStorageOuterLoop considers either a dense or sparse
/// outermost loop only), and (2) the generated code is an actual for-loop
/// (and not a co-iterating while-loop).
enum class SparseParallelizationStrategy {
kNone,
kDenseOuterLoop,
kAnyStorageOuterLoop,
kDenseAnyLoop,
kAnyStorageAnyLoop
// TODO: support reduction parallelization too?
};
/// Defines a vectorization strategy. Any implicit inner loop in the Linalg
/// operation is a candidate (full SIMD for "parallel" loops and horizontal
/// SIMD for "reduction" loops). A loop is actually vectorized if (1) allowed
/// by the strategy, and (2) the emitted code is an actual for-loop (and not
/// a co-iterating while-loop).
enum class SparseVectorizationStrategy {
kNone,
kDenseInnerLoop,
kAnyStorageInnerLoop
};
/// Defines a type for "pointer" and "index" storage in the sparse storage
/// scheme, with a choice between the native platform-dependent index width,
/// 64-bit integers, or 32-bit integers. A narrow width obviously reduces
/// the memory footprint of the sparse storage scheme, but the width should
/// suffice to define the total required range (viz. the maximum number of
/// stored entries per indirection level for the "pointers" and the maximum
/// value of each tensor index over all dimensions for the "indices").
enum class SparseIntType { kNative, kI64, kI32 };
/// Sparsification options.
struct SparsificationOptions {
SparsificationOptions(SparseParallelizationStrategy p,
SparseVectorizationStrategy v, unsigned vl,
SparseIntType pt, SparseIntType it)
: parallelizationStrategy(p), vectorizationStrategy(v), vectorLength(vl),
ptrType(pt), indType(it) {}
SparsificationOptions()
: SparsificationOptions(SparseParallelizationStrategy::kNone,
SparseVectorizationStrategy::kNone, 1u,
SparseIntType::kNative, SparseIntType::kNative) {}
SparseParallelizationStrategy parallelizationStrategy;
SparseVectorizationStrategy vectorizationStrategy;
unsigned vectorLength;
SparseIntType ptrType;
SparseIntType indType;
};
/// Set up sparsification rewriting rules with the given options.
void populateSparsificationPatterns(
MLIRContext *context, OwningRewritePatternList &patterns,
const SparsificationOptions &options = SparsificationOptions());
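// Example setup (a minimal sketch; the strategy and width choices are
// illustrative):
//   SparsificationOptions sparseOptions(
//       SparseParallelizationStrategy::kAnyStorageOuterLoop,
//       SparseVectorizationStrategy::kDenseInnerLoop, /*vectorLength=*/16,
//       SparseIntType::kI32, SparseIntType::kI32);
//   OwningRewritePatternList sparsePatterns;
//   populateSparsificationPatterns(ctx, sparsePatterns, sparseOptions);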
} // namespace linalg
} // namespace mlir
#endif // DIALECT_LINALG_TRANSFORMS_TRANSFORMS_H_