//===- VectorUtils.h - Vector Utilities -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_
#define MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/TypeSwitch.h"
namespace mlir {
// Forward declarations.
class AffineMap;
class Block;
class Location;
class OpBuilder;
class Operation;
class ShapedType;
class Value;
class VectorType;
class VectorTransferOpInterface;
namespace affine {
class AffineApplyOp;
class AffineForOp;
} // namespace affine
namespace vector {
/// Helper function that creates a memref::DimOp or tensor::DimOp depending on
/// the type of `source`.
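///
/// A possible call site (a sketch only; `builder`, `loc` and a value `src` of
/// type `memref<?x4xf32>` are assumed to be in scope):
///
/// ```c++
/// // Dim 0 is dynamic, so this creates a memref::DimOp.
/// Value d0 = createOrFoldDimOp(builder, loc, src, /*dim=*/0);
/// // Dim 1 is static (4), so the result may fold to a constant index.
/// Value d1 = createOrFoldDimOp(builder, loc, src, /*dim=*/1);
/// ```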
Value createOrFoldDimOp(OpBuilder &b, Location loc, Value source, int64_t dim);
/// Returns the two dims that are greater than one if the transposition is
/// applied to a 2D slice. Otherwise, returns failure.
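///
/// For example (illustrative), transposing vector<1x4x8xi32> with permutation
/// [0, 2, 1] only permutes the trailing 4x8 slice, so the returned pair of
/// dims would be (1, 2).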
FailureOr<std::pair<int, int>> isTranspose2DSlice(vector::TransposeOp op);
/// Return true if `vectorType` is a contiguous slice of `memrefType`.
///
/// Only the N = vectorType.getRank() trailing dims of `memrefType` are
/// checked (the other dims are not relevant). Note that for `vectorType` to be
/// a contiguous slice of `memrefType`, the trailing dims of the latter have
/// to be contiguous - this is checked by looking at the corresponding strides.
///
/// There might be some restrictions on the leading dims of `vectorType`:
///
/// Case 1. If all the trailing dims of `vectorType` match the trailing dims
/// of `memrefType` then the leading dim of `vectorType` can be
/// arbitrary.
///
/// Ex. 1.1 contiguous slice, perfect match
/// vector<4x3x2xi32> from memref<5x4x3x2xi32>
/// Ex. 1.2 contiguous slice, the leading dim does not match (2 != 4)
/// vector<2x3x2xi32> from memref<5x4x3x2xi32>
///
/// Case 2. If an "internal" dim of `vectorType` does not match the
/// corresponding trailing dim in `memrefType` then the remaining
/// leading dims of `vectorType` have to be 1 (the first non-matching
/// dim can be arbitrary).
///
/// Ex. 2.1 non-contiguous slice, 2 != 3 and the leading dim != <1>
/// vector<2x2x2xi32> from memref<5x4x3x2xi32>
/// Ex. 2.2 contiguous slice, 2 != 3 and the leading dim == <1>
/// vector<1x2x2xi32> from memref<5x4x3x2xi32>
/// Ex. 2.3. contiguous slice, 2 != 3 and the leading dims == <1x1>
/// vector<1x1x2x2xi32> from memref<5x4x3x2xi32>
/// Ex. 2.4. non-contiguous slice, 2 != 3 and the leading dims != <1x1>
/// vector<2x1x2x2xi32> from memref<5x4x3x2xi32>
bool isContiguousSlice(MemRefType memrefType, VectorType vectorType);
/// Returns an iterator for all positions in the leading dimensions of `vType`
/// up to the `targetRank`. If any leading dimension before the `targetRank` is
/// scalable (so cannot be unrolled), it will return an iterator for positions
/// up to the first scalable dimension.
///
/// If no leading dimensions can be unrolled, an empty optional will be
/// returned.
///
/// Examples:
///
/// For vType = vector<2x3x4> and targetRank = 1
///
/// The resulting iterator will yield:
/// [0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]
///
/// For vType = vector<3x[4]x5> and targetRank = 0
///
/// The scalable dimension blocks unrolling so the iterator yields only:
/// [0], [1], [2]
///
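/// A possible usage sketch (assuming a VectorType `vType` is in scope):
///
/// ```c++
/// if (auto unrollIterator = createUnrollIterator(vType, /*targetRank=*/1)) {
///   for (auto position : *unrollIterator) {
///     // `position` holds the offsets into the leading dims of one slice,
///     // e.g. [0, 0], [0, 1], ... for vector<2x3x4>.
///   }
/// }
/// ```
///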
std::optional<StaticTileOffsetRange>
createUnrollIterator(VectorType vType, int64_t targetRank = 1);
/// A wrapper for getMixedSizes for vector.transfer_read and
/// vector.transfer_write Ops (for source and destination, respectively).
///
/// Tensor and MemRef types implement their own, very similar version of
/// getMixedSizes. This method will call the appropriate version (depending on
/// `hasTensorSemantics`). It will also automatically extract the operand on
/// which to call it (the source for "read" ops and the destination for
/// "write" ops).
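///
/// A possible call site (a sketch; `readOp` is a `vector::TransferReadOp` with
/// tensor semantics and `rewriter` is a `RewriterBase` in scope):
///
/// ```c++
/// SmallVector<OpFoldResult> mixedSourceDims =
///     getMixedSizesXfer(/*hasTensorSemantics=*/true, readOp, rewriter);
/// ```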
SmallVector<OpFoldResult> getMixedSizesXfer(bool hasTensorSemantics,
Operation *xfer,
RewriterBase &rewriter);
/// A pattern for ops that implement `MaskableOpInterface` and that _might_ be
/// masked (i.e. placed inside a `vector.mask` Op region). In particular:
/// 1. Matches the `SourceOp` operation, Op.
/// 2.1. If Op is masked, retrieves the masking Op, maskOp, and updates the
/// insertion point to avoid inserting new ops into the `vector.mask` Op
/// region (which only allows one Op).
/// 2.2 If Op is not masked, this step is skipped.
/// 3. Invokes `matchAndRewriteMaskableOp` on Op and optionally maskOp if
/// found in step 2.1.
///
/// This wrapper frees patterns from re-implementing the logic to update the
/// insertion point when a maskable Op is masked. Such patterns are still
/// responsible for providing an updated ("rewritten") version of:
/// a. the source Op when mask _is not_ present,
/// b. the source Op and the masking Op when mask _is_ present.
/// To use this pattern, implement `matchAndRewriteMaskableOp`. Note that
/// the return value will depend on the case above.
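///
/// A minimal sketch of a derived pattern (`MyOp` and `lowerMyOp` are
/// hypothetical and only illustrate the plumbing):
///
/// ```c++
/// struct MyOpLowering final : MaskableOpRewritePattern<MyOp> {
///   using MaskableOpRewritePattern::MaskableOpRewritePattern;
///
///   FailureOr<Value>
///   matchAndRewriteMaskableOp(MyOp op, MaskingOpInterface maskingOp,
///                             PatternRewriter &rewriter) const override {
///     // The insertion point is already outside any enclosing vector.mask
///     // region, so new ops can be created here directly.
///     Value replacement = lowerMyOp(op, rewriter); // hypothetical helper
///     if (!replacement)
///       return failure();
///     // The wrapper replaces `op`, or the enclosing mask Op if present,
///     // with this value.
///     return replacement;
///   }
/// };
/// ```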
template <class SourceOp>
struct MaskableOpRewritePattern : OpRewritePattern<SourceOp> {
using OpRewritePattern<SourceOp>::OpRewritePattern;
private:
LogicalResult matchAndRewrite(SourceOp sourceOp,
PatternRewriter &rewriter) const final {
auto maskableOp = dyn_cast<MaskableOpInterface>(sourceOp.getOperation());
if (!maskableOp)
return failure();
Operation *rootOp = sourceOp;
// If this Op is masked, update the insertion point to avoid inserting into
// the vector.mask Op region.
OpBuilder::InsertionGuard guard(rewriter);
MaskingOpInterface maskOp;
if (maskableOp.isMasked()) {
maskOp = maskableOp.getMaskingOp();
rewriter.setInsertionPoint(maskOp);
rootOp = maskOp;
}
FailureOr<Value> newOp =
matchAndRewriteMaskableOp(sourceOp, maskOp, rewriter);
if (failed(newOp))
return failure();
rewriter.replaceOp(rootOp, *newOp);
return success();
}
public:
// Matches `sourceOp` that can potentially be masked with `maskingOp`. If the
// latter is present, returns a replacement for `maskingOp`. Otherwise,
// returns a replacement for `sourceOp`.
virtual FailureOr<Value>
matchAndRewriteMaskableOp(SourceOp sourceOp, MaskingOpInterface maskingOp,
PatternRewriter &rewriter) const = 0;
};
/// Returns true if the input Vector type can be linearized.
///
/// Linearization is meant in the sense of flattening vectors, e.g.:
/// * vector<NxMxKxi32> -> vector<N*M*Kxi32>
/// In this sense, vectors that:
/// * are already linearized, or
/// * contain more than one scalable dimension,
/// are not linearizable.
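///
/// For example (illustrative):
///   * vector<4x8xi32>       -> linearizable (to vector<32xi32>)
///   * vector<16xi32>        -> already linearized, hence not linearizable
///   * vector<2x[4]x[4]xi32> -> two scalable dims, hence not linearizable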
bool isLinearizableVector(VectorType type);
/// Creates a TransferReadOp from `source` with static shape `readShape`. If
/// the vector type for the read does not match the type of `source`, a mask
/// is created on the read, provided that use of a mask is requested or the
/// bounds of a dimension differ.
///
/// If `useInBoundsInsteadOfMasking` is false, the `inBoundsVal` values are set
/// properly, based on the dimensions of the source and destination tensors;
/// that is what determines whether masking is done.
///
/// Note that the internal `vector::TransferReadOp` always reads at index zero
/// in each dimension of the passed-in tensor.
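///
/// A possible call site (a sketch; `builder`, `loc`, `srcTensor` and
/// `padValue` are assumed to be in scope):
///
/// ```c++
/// // Read an 8x16 slice from `srcTensor`, masking out-of-bounds elements
/// // rather than relying on the in_bounds attribute.
/// Value read = createReadOrMaskedRead(builder, loc, srcTensor,
///                                     /*readShape=*/{8, 16}, padValue,
///                                     /*useInBoundsInsteadOfMasking=*/false);
/// ```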
Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source,
ArrayRef<int64_t> readShape, Value padValue,
bool useInBoundsInsteadOfMasking);
/// Returns success if `inputVectorSizes` is a valid masking configuration for
/// the given `shape`, i.e., it meets:
/// 1. The numbers of elements in both arrays are equal.
/// 2. `inputVectorSizes` does not have dynamic dimensions.
/// 3. All the values in `inputVectorSizes` are greater than or equal to the
///    static sizes in `shape`.
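///
/// For example (illustrative), for shape = [4, ?]:
///   * inputVectorSizes = [8, 16] is valid,
///   * inputVectorSizes = [2, 16] is not (2 < 4), and
///   * inputVectorSizes = [8] is not (different number of elements).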
LogicalResult isValidMaskedInputVector(ArrayRef<int64_t> shape,
ArrayRef<int64_t> inputVectorSizes);
} // namespace vector
/// Constructs a permutation map of invariant memref indices to vector
/// dimension.
///
/// If no index is found to be invariant, 0 is added to the permutation_map and
/// corresponds to a vector broadcast along that dimension.
///
/// The implementation uses the knowledge of the mapping of loops to
/// vector dimension. `loopToVectorDim` carries this information as a map with:
/// - keys representing "vectorized enclosing loops";
/// - values representing the corresponding vector dimension.
/// Note that loopToVectorDim is a whole function map from which only enclosing
/// loop information is extracted.
///
/// Prerequisites: `indices` belong to a vectorizable load or store operation
/// (i.e. at most one invariant index along each AffineForOp of
/// `loopToVectorDim`). `insertPoint` is the insertion point for the vectorized
/// load or store operation.
///
/// Example 1:
/// The following MLIR snippet:
///
/// ```mlir
/// affine.for %i3 = 0 to %0 {
/// affine.for %i4 = 0 to %1 {
/// affine.for %i5 = 0 to %2 {
/// %a5 = load %arg0[%i4, %i5, %i3] : memref<?x?x?xf32>
/// }}}
/// ```
///
/// may vectorize with {permutation_map: (d0, d1, d2) -> (d2, d1)} into:
///
/// ```mlir
/// affine.for %i3 = 0 to %0 step 32 {
/// affine.for %i4 = 0 to %1 {
/// affine.for %i5 = 0 to %2 step 256 {
/// %4 = vector.transfer_read %arg0, %i4, %i5, %i3
/// {permutation_map: (d0, d1, d2) -> (d2, d1)} :
/// (memref<?x?x?xf32>, index, index) -> vector<32x256xf32>
/// }}}
/// ```
///
/// Meaning that vector.transfer_read will be responsible for reading the slice:
/// `%arg0[%i4, %i5:%i5+256, %i3:%i3+32]` into vector<32x256xf32>.
///
/// Example 2:
/// The following MLIR snippet:
///
/// ```mlir
/// %cst0 = arith.constant 0 : index
/// affine.for %i0 = 0 to %0 {
/// %a0 = load %arg0[%cst0, %cst0] : memref<?x?xf32>
/// }
/// ```
///
/// may vectorize with {permutation_map: (d0) -> (0)} into:
///
/// ```mlir
/// affine.for %i0 = 0 to %0 step 128 {
/// %3 = vector.transfer_read %arg0, %c0_0, %c0_0
/// {permutation_map: (d0, d1) -> (0)} :
/// (memref<?x?xf32>, index, index) -> vector<128xf32>
/// }
/// ```
///
/// Meaning that vector.transfer_read will be responsible for reading the slice
/// `%arg0[%c0, %c0]` into vector<128xf32> which needs a 1-D vector broadcast.
///
AffineMap
makePermutationMap(Block *insertPoint, ArrayRef<Value> indices,
const DenseMap<Operation *, unsigned> &loopToVectorDim);
AffineMap
makePermutationMap(Operation *insertPoint, ArrayRef<Value> indices,
const DenseMap<Operation *, unsigned> &loopToVectorDim);
namespace matcher {
/// Matches vector.transfer_read, vector.transfer_write and ops that return a
/// vector type that is a multiple of the sub-vector type. This allows passing
/// over other smaller vector types in the function and avoids interfering with
/// operations on those.
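/// For example (illustrative), an op producing vector<4x8x16xf32> operates on
/// super-vectors of vector<8x16xf32>, but not of vector<5x16xf32> (8 is not a
/// multiple of 5).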
/// This is a first approximation; it can easily be extended in the future.
/// TODO: this could all be much simpler if we added a bit to vector types to
/// mark that a vector is a strict super-vector, but that still does not
/// warrant adding even one extra bit in the IR for now.
bool operatesOnSuperVectorsOf(Operation &op, VectorType subVectorType);
} // namespace matcher
} // namespace mlir
#endif // MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_