mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp - third_party/github.com/llvm/llvm-project - Git at Google

 //===- LoopTiling.cpp --- Loop tiling pass ------------------------------*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements a pass to tile loop nests.
 //
 //===----------------------------------------------------------------------===//

 #include "PassDetail.h"
 #include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/Analysis/AffineStructures.h"
 #include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Transforms/LoopUtils.h"
 #include "mlir/Transforms/Utils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 using namespace mlir;

 #define DEBUG_TYPE "affine-loop-tile"

 namespace {

 /// A pass to perform loop tiling on all suitable loop nests of a Function.
 struct LoopTiling : public AffineLoopTilingBase<LoopTiling> {
   LoopTiling() = default;
   explicit LoopTiling(uint64_t cacheSizeBytes, bool avoidMaxMinBounds = true)
       : avoidMaxMinBounds(avoidMaxMinBounds) {
     this->cacheSizeInKiB = cacheSizeBytes / 1024;
   }

   void runOnFunction() override;
   void getTileSizes(ArrayRef<AffineForOp> band,
                     SmallVectorImpl<unsigned> *tileSizes);

   // Default tile size if nothing is provided.
   constexpr static unsigned kDefaultTileSize = 4;

   // If true, tile sizes are set to avoid max/min in bounds if possible.
   bool avoidMaxMinBounds = true;
 };

 } // end anonymous namespace

 /// Creates a pass to perform loop tiling on all suitable loop nests of a
 /// Function.
 std::unique_ptr<OperationPass<FuncOp>>
 mlir::createLoopTilingPass(uint64_t cacheSizeBytes) {
   return std::make_unique<LoopTiling>(cacheSizeBytes);
 }
 std::unique_ptr<OperationPass<FuncOp>> mlir::createLoopTilingPass() {
   return std::make_unique<LoopTiling>();
 }

 // Move the loop body of AffineForOp 'src' from 'src' into the specified
 // location in destination's body, ignoring the terminator.
 static inline void moveLoopBody(AffineForOp src, AffineForOp dest,
                                 Block::iterator loc) {
   auto &insts = src.getBody()->getOperations();
   dest.getBody()->getOperations().splice(loc, insts, insts.begin(),
                                          std::prev(insts.end()));
 }

 // Move the loop body of AffineForOp 'src' from 'src' to the start of dest's
 // body.
 static inline void moveLoopBody(AffineForOp src, AffineForOp dest) {
   moveLoopBody(src, dest, dest.getBody()->begin());
 }

 /// Constructs and sets new loop bounds after tiling for the case of
 /// hyper-rectangular index sets, where the bounds of one dimension do not
 /// depend on other dimensions. Bounds of each dimension can thus be treated
 /// independently, and deriving the new bounds is much simpler and faster
 /// than for the case of tiling arbitrary polyhedral shapes.
 static void
 constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
                                 MutableArrayRef<AffineForOp> newLoops,
                                 ArrayRef<unsigned> tileSizes) {
   assert(!origLoops.empty());
   assert(origLoops.size() == tileSizes.size());

   OpBuilder b(origLoops[0].getOperation());
   unsigned width = origLoops.size();

   // Bounds for tile space loops.
   for (unsigned i = 0; i < width; i++) {
     OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
     OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
     newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
     newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
     newLoops[i].setStep(tileSizes[i]);
   }
   // Bounds for intra-tile loops.
   for (unsigned i = 0; i < width; i++) {
     int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
     auto mayBeConstantCount = getConstantTripCount(origLoops[i]);
     // The lower bound is just the tile-space loop.
     AffineMap lbMap = b.getDimIdentityMap();
     newLoops[width + i].setLowerBound(
         /*operands=*/newLoops[i].getInductionVar(), lbMap);

     // Set the upper bound.
     if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) {
       // Trip count is less than the tile size: upper bound is lower bound +
       // trip count.
       auto ubMap = b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue());
       newLoops[width + i].setUpperBound(
           /*operands=*/newLoops[i].getInductionVar(), ubMap);
     } else if (largestDiv % tileSizes[i] != 0) {
       // Intra-tile loop ii goes from i to min(i + tileSize, ub_i).
       // Construct the upper bound map; the operands are the original operands
       // with 'i' (tile-space loop) appended to it. The new upper bound map is
       // the original one with an additional expression i + tileSize appended.

       // Add dim operands from original upper bound.
       SmallVector<Value, 4> ubOperands;
       auto ub = origLoops[i].getUpperBound();
       ubOperands.reserve(ub.getNumOperands() + 1);
       auto origUbMap = ub.getMap();
       for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
         ubOperands.push_back(ub.getOperand(j));

       // Add dim operand for new loop upper bound.
       ubOperands.push_back(newLoops[i].getInductionVar());

       // Add symbol operands from original upper bound.
       for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
         ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

       SmallVector<AffineExpr, 4> boundExprs;
       boundExprs.reserve(1 + origUbMap.getNumResults());
       auto dim = b.getAffineDimExpr(origUbMap.getNumDims());
       // The new upper bound map is the original one with an additional
       // expression i + tileSize appended.
       boundExprs.push_back(dim + tileSizes[i]);
       boundExprs.append(origUbMap.getResults().begin(),
                         origUbMap.getResults().end());
       auto ubMap =
           AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
                          boundExprs, b.getContext());
       newLoops[width + i].setUpperBound(/*operands=*/ubOperands, ubMap);
     } else {
       // No need of the min expression.
       auto dim = b.getAffineDimExpr(0);
       auto ubMap = AffineMap::get(1, 0, dim + tileSizes[i]);
       newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
     }
   }
 }

 /// Tiles the specified band of perfectly nested loops creating tile-space loops
 /// and intra-tile loops. A band is a contiguous set of loops.
 //  TODO: handle non hyper-rectangular spaces.
 LogicalResult
 mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                           ArrayRef<unsigned> tileSizes,
                           SmallVectorImpl<AffineForOp> *tiledNest) {
   // Check if the supplied for op's are all successively nested.
   assert(!input.empty() && "no loops in input band");
   assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

   assert(isPerfectlyNested(input) && "input loops not perfectly nested");

   auto origLoops = input;

   AffineForOp rootAffineForOp = origLoops[0];
   auto loc = rootAffineForOp.getLoc();
   // Note that width is at least one since band isn't empty.
   unsigned width = input.size();

   SmallVector<AffineForOp, 6> tiledLoops(2 * width);

   // The outermost among the loops as we add more..
   auto *topLoop = rootAffineForOp.getOperation();
   AffineForOp innermostPointLoop;

   // Add intra-tile (or point) loops.
   for (unsigned i = 0; i < width; i++) {
     OpBuilder b(topLoop);
     // Loop bounds will be set later.
     auto pointLoop = b.create<AffineForOp>(loc, 0, 0);
     pointLoop.getBody()->getOperations().splice(
         pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
         topLoop);
     tiledLoops[2 * width - 1 - i] = pointLoop;
     topLoop = pointLoop.getOperation();
     if (i == 0)
       innermostPointLoop = pointLoop;
   }

   // Add tile space loops;
   for (unsigned i = width; i < 2 * width; i++) {
     OpBuilder b(topLoop);
     // Loop bounds will be set later.
     auto tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
     tileSpaceLoop.getBody()->getOperations().splice(
         tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
         topLoop);
     tiledLoops[2 * width - i - 1] = tileSpaceLoop;
     topLoop = tileSpaceLoop.getOperation();
   }

   // Move the loop body of the original nest to the new one.
   moveLoopBody(origLoops.back(), innermostPointLoop);

   SmallVector<Value, 8> origLoopIVs;
   extractForInductionVars(input, &origLoopIVs);

   FlatAffineConstraints cst;
   getIndexSet(input, &cst);
   if (!cst.isHyperRectangular(0, width)) {
     llvm::dbgs() << "tiled code generation unimplemented for the "
                     "non-hyperrectangular case, op:"
                  << *rootAffineForOp << "\n";
     return failure();
   }

   constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);

   // Replace original IVs with intra-tile loop IVs.
   for (unsigned i = 0; i < width; i++)
     origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

   // Erase the old loop nest.
   rootAffineForOp.erase();

   if (tiledNest)
     *tiledNest = std::move(tiledLoops);

   return success();
 }

 // Identify valid and profitable bands of loops to tile. This is currently just
 // a temporary placeholder to test the mechanics of tiled code generation.
 // Returns all maximal outermost perfect loop nests to tile.
 static void getTileableBands(FuncOp f,
                              std::vector<SmallVector<AffineForOp, 6>> *bands) {
   // Get maximal perfect nest of 'affine.for' insts starting from root
   // (inclusive).
   auto getMaximalPerfectLoopNest = [&](AffineForOp root) {
     SmallVector<AffineForOp, 6> band;
     getPerfectlyNestedLoops(band, root);
     bands->push_back(band);
   };

   for (auto &block : f)
     for (auto &op : block)
       if (auto forOp = dyn_cast<AffineForOp>(op))
         getMaximalPerfectLoopNest(forOp);
 }

 /// Reduces each tile size to the largest divisor of the corresponding trip
 /// count (if the trip count is known).
 static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
                                          SmallVectorImpl<unsigned> *tileSizes) {
   assert(band.size() == tileSizes->size() && "invalid tile size count");
   for (unsigned i = 0, e = band.size(); i < e; i++) {
     unsigned &tSizeAdjusted = (*tileSizes)[i];
     auto mayConst = getConstantTripCount(band[i]);
     if (!mayConst)
       continue;
     // Adjust the tile size to largest factor of the trip count less than
     // tSize.
     uint64_t constTripCount = mayConst.getValue();
     if (constTripCount > 1 && tSizeAdjusted > constTripCount / 2)
       tSizeAdjusted = constTripCount / 2;
     while (constTripCount % tSizeAdjusted != 0)
       tSizeAdjusted--;
   }
 }

 // Returns tile sizes to use. Checks CL options; if none are specified, sets it
 // based on a simple model that looks at the memory footprint and determines
 // tile sizes assuming identity accesses / 1:1 tile size proportional footprint
 // along each of the dimensions being tiled.
 // TODO: evolve this model. Tile size determination is a large area
 // to play with in general.
 void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
                               SmallVectorImpl<unsigned> *tileSizes) {
   if (band.empty())
     return;

   // Use command-line tileSize for all loops if specified.
   if (tileSize) {
     tileSizes->assign(band.size(), tileSize);
     return;
   }

   // Use tileSizes and fill them with default tile size if it's short.
   if (!this->tileSizes.empty()) {
     tileSizes->assign(this->tileSizes.begin(), this->tileSizes.end());
     tileSizes->resize(band.size(), kDefaultTileSize);
     return;
   }
   tileSizes->resize(band.size());

   // The first loop in the band.
   auto rootForOp = band[0];
   (void)rootForOp;

   // Obtain memory footprint and set tile sizes so that a tile fits in
   // the cache size. This is an approximation with the assumption that the
   // footprint increases with the tile size linearly in that dimension (i.e.,
   // assumes one-to-one access function).
   auto fp = getMemoryFootprintBytes(band[0], 0);
   if (!fp) {
     // Fill with default tile sizes if footprint is unknown.
     std::fill(tileSizes->begin(), tileSizes->end(),
               LoopTiling::kDefaultTileSize);
     if (avoidMaxMinBounds)
       adjustToDivisorsOfTripCounts(band, tileSizes);
     LLVM_DEBUG(
         rootForOp.emitWarning("memory footprint unknown: using default tile "
                               "sizes adjusted to trip count divisors"));
     return;
   }

   // Check how many times larger the cache size is when compared to footprint.
   uint64_t cacheSizeBytes = cacheSizeInKiB * 1024;
   uint64_t excessFactor = llvm::divideCeil(fp.getValue(), cacheSizeBytes);
   if (excessFactor <= 1) {
     // No need of any tiling - set tile size to 1.
     std::fill(tileSizes->begin(), tileSizes->end(), 1);
     return;
   }

   // Divide all loops equally in an attempt to reduce footprint.
   // TODO: this is approximate. Ideally, obtain reuse factor /
   // profitability along each dimension and weight tile sizes based on that as
   // one possible approach. Or compute a polynomial in tile sizes and solve for
   // it.

   // For an n-d tileable band, compute the n^th root of the excess.
   unsigned tSize =
       static_cast<unsigned>(floorl(std::pow(excessFactor, 1.0 / band.size())));
   // We'll keep a running product to determine the last tile size better.
   unsigned cumulProductOfTileSizes = 1;
   for (unsigned i = 0, e = band.size(); i < e; i++) {
     if (i < e - 1)
       (*tileSizes)[i] = tSize;
     else
       // Set last tile size to cover the balance.
       (*tileSizes)[i] = std::max(
           1U, static_cast<unsigned>(excessFactor / cumulProductOfTileSizes));
     cumulProductOfTileSizes *= (*tileSizes)[i];
   }
   if (avoidMaxMinBounds)
     adjustToDivisorsOfTripCounts(band, tileSizes);
 }

 void LoopTiling::runOnFunction() {
   // Bands of loops to tile.
   std::vector<SmallVector<AffineForOp, 6>> bands;
   getTileableBands(getFunction(), &bands);

   // Tile each band.
   for (auto &band : bands) {
     // Set up tile sizes; fill missing tile sizes at the end with default tile
     // size or tileSize if one was provided.
     SmallVector<unsigned, 6> tileSizes;
     getTileSizes(band, &tileSizes);
     if (llvm::DebugFlag) {
       auto diag = band[0].emitRemark("using tile sizes [");
       for (auto tSize : tileSizes)
         diag << tSize << ' ';
       diag << "]\n";
     }
     SmallVector<AffineForOp, 6> tiledNest;
     if (failed(tilePerfectlyNested(band, tileSizes, &tiledNest)))
       return signalPassFailure();

     // Separate full and partial tiles.
     if (separate) {
       auto intraTileLoops =
           MutableArrayRef<AffineForOp>(tiledNest).drop_front(band.size());
       separateFullTiles(intraTileLoops);
     }
   }
 }

 constexpr unsigned LoopTiling::kDefaultTileSize;
	//===- LoopTiling.cpp --- Loop tiling pass ------------------------------*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements a pass to tile loop nests.
	//
	//===----------------------------------------------------------------------===//

	#include "PassDetail.h"
	#include "mlir/Analysis/AffineAnalysis.h"
	#include "mlir/Analysis/AffineStructures.h"
	#include "mlir/Analysis/LoopAnalysis.h"
	#include "mlir/Analysis/Utils.h"
	#include "mlir/Dialect/Affine/IR/AffineOps.h"
	#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
	#include "mlir/Dialect/Affine/Passes.h"
	#include "mlir/IR/BlockAndValueMapping.h"
	#include "mlir/IR/Builders.h"
	#include "mlir/Transforms/LoopUtils.h"
	#include "mlir/Transforms/Utils.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	using namespace mlir;

	#define DEBUG_TYPE "affine-loop-tile"

	namespace {

	/// A pass to perform loop tiling on all suitable loop nests of a Function.
	struct LoopTiling : public AffineLoopTilingBase<LoopTiling> {
	LoopTiling() = default;
	explicit LoopTiling(uint64_t cacheSizeBytes, bool avoidMaxMinBounds = true)
	: avoidMaxMinBounds(avoidMaxMinBounds) {
	this->cacheSizeInKiB = cacheSizeBytes / 1024;
	}

	void runOnFunction() override;
	void getTileSizes(ArrayRef<AffineForOp> band,
	SmallVectorImpl<unsigned> *tileSizes);

	// Default tile size if nothing is provided.
	constexpr static unsigned kDefaultTileSize = 4;

	// If true, tile sizes are set to avoid max/min in bounds if possible.
	bool avoidMaxMinBounds = true;
	};

	} // end anonymous namespace

	/// Creates a pass to perform loop tiling on all suitable loop nests of a
	/// Function.
	std::unique_ptr<OperationPass<FuncOp>>
	mlir::createLoopTilingPass(uint64_t cacheSizeBytes) {
	return std::make_unique<LoopTiling>(cacheSizeBytes);
	}
	std::unique_ptr<OperationPass<FuncOp>> mlir::createLoopTilingPass() {
	return std::make_unique<LoopTiling>();
	}

	// Move the loop body of AffineForOp 'src' from 'src' into the specified
	// location in destination's body, ignoring the terminator.
	static inline void moveLoopBody(AffineForOp src, AffineForOp dest,
	Block::iterator loc) {
	auto &insts = src.getBody()->getOperations();
	dest.getBody()->getOperations().splice(loc, insts, insts.begin(),
	std::prev(insts.end()));
	}

	// Move the loop body of AffineForOp 'src' from 'src' to the start of dest's
	// body.
	static inline void moveLoopBody(AffineForOp src, AffineForOp dest) {
	moveLoopBody(src, dest, dest.getBody()->begin());
	}

	/// Constructs and sets new loop bounds after tiling for the case of
	/// hyper-rectangular index sets, where the bounds of one dimension do not
	/// depend on other dimensions. Bounds of each dimension can thus be treated
	/// independently, and deriving the new bounds is much simpler and faster
	/// than for the case of tiling arbitrary polyhedral shapes.
	static void
	constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
	MutableArrayRef<AffineForOp> newLoops,
	ArrayRef<unsigned> tileSizes) {
	assert(!origLoops.empty());
	assert(origLoops.size() == tileSizes.size());

	OpBuilder b(origLoops[0].getOperation());
	unsigned width = origLoops.size();

	// Bounds for tile space loops.
	for (unsigned i = 0; i < width; i++) {
	OperandRange newLbOperands = origLoops[i].getLowerBoundOperands();
	OperandRange newUbOperands = origLoops[i].getUpperBoundOperands();
	newLoops[i].setLowerBound(newLbOperands, origLoops[i].getLowerBoundMap());
	newLoops[i].setUpperBound(newUbOperands, origLoops[i].getUpperBoundMap());
	newLoops[i].setStep(tileSizes[i]);
	}
	// Bounds for intra-tile loops.
	for (unsigned i = 0; i < width; i++) {
	int64_t largestDiv = getLargestDivisorOfTripCount(origLoops[i]);
	auto mayBeConstantCount = getConstantTripCount(origLoops[i]);
	// The lower bound is just the tile-space loop.
	AffineMap lbMap = b.getDimIdentityMap();
	newLoops[width + i].setLowerBound(
	/operands=/newLoops[i].getInductionVar(), lbMap);

	// Set the upper bound.
	if (mayBeConstantCount && mayBeConstantCount.getValue() < tileSizes[i]) {
	// Trip count is less than the tile size: upper bound is lower bound +
	// trip count.
	auto ubMap = b.getSingleDimShiftAffineMap(mayBeConstantCount.getValue());
	newLoops[width + i].setUpperBound(
	/operands=/newLoops[i].getInductionVar(), ubMap);
	} else if (largestDiv % tileSizes[i] != 0) {
	// Intra-tile loop ii goes from i to min(i + tileSize, ub_i).
	// Construct the upper bound map; the operands are the original operands
	// with 'i' (tile-space loop) appended to it. The new upper bound map is
	// the original one with an additional expression i + tileSize appended.

	// Add dim operands from original upper bound.
	SmallVector<Value, 4> ubOperands;
	auto ub = origLoops[i].getUpperBound();
	ubOperands.reserve(ub.getNumOperands() + 1);
	auto origUbMap = ub.getMap();
	for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
	ubOperands.push_back(ub.getOperand(j));

	// Add dim operand for new loop upper bound.
	ubOperands.push_back(newLoops[i].getInductionVar());

	// Add symbol operands from original upper bound.
	for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
	ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));

	SmallVector<AffineExpr, 4> boundExprs;
	boundExprs.reserve(1 + origUbMap.getNumResults());
	auto dim = b.getAffineDimExpr(origUbMap.getNumDims());
	// The new upper bound map is the original one with an additional
	// expression i + tileSize appended.
	boundExprs.push_back(dim + tileSizes[i]);
	boundExprs.append(origUbMap.getResults().begin(),
	origUbMap.getResults().end());
	auto ubMap =
	AffineMap::get(origUbMap.getNumDims() + 1, origUbMap.getNumSymbols(),
	boundExprs, b.getContext());
	newLoops[width + i].setUpperBound(/operands=/ubOperands, ubMap);
	} else {
	// No need of the min expression.
	auto dim = b.getAffineDimExpr(0);
	auto ubMap = AffineMap::get(1, 0, dim + tileSizes[i]);
	newLoops[width + i].setUpperBound(newLoops[i].getInductionVar(), ubMap);
	}
	}
	}

	/// Tiles the specified band of perfectly nested loops creating tile-space loops
	/// and intra-tile loops. A band is a contiguous set of loops.
	// TODO: handle non hyper-rectangular spaces.
	LogicalResult
	mlir::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
	ArrayRef<unsigned> tileSizes,
	SmallVectorImpl<AffineForOp> *tiledNest) {
	// Check if the supplied for op's are all successively nested.
	assert(!input.empty() && "no loops in input band");
	assert(input.size() == tileSizes.size() && "Too few/many tile sizes");

	assert(isPerfectlyNested(input) && "input loops not perfectly nested");

	auto origLoops = input;

	AffineForOp rootAffineForOp = origLoops[0];
	auto loc = rootAffineForOp.getLoc();
	// Note that width is at least one since band isn't empty.
	unsigned width = input.size();

	SmallVector<AffineForOp, 6> tiledLoops(2 * width);

	// The outermost among the loops as we add more..
	auto *topLoop = rootAffineForOp.getOperation();
	AffineForOp innermostPointLoop;

	// Add intra-tile (or point) loops.
	for (unsigned i = 0; i < width; i++) {
	OpBuilder b(topLoop);
	// Loop bounds will be set later.
	auto pointLoop = b.create<AffineForOp>(loc, 0, 0);
	pointLoop.getBody()->getOperations().splice(
	pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
	topLoop);
	tiledLoops[2 * width - 1 - i] = pointLoop;
	topLoop = pointLoop.getOperation();
	if (i == 0)
	innermostPointLoop = pointLoop;
	}

	// Add tile space loops;
	for (unsigned i = width; i < 2 * width; i++) {
	OpBuilder b(topLoop);
	// Loop bounds will be set later.
	auto tileSpaceLoop = b.create<AffineForOp>(loc, 0, 0);
	tileSpaceLoop.getBody()->getOperations().splice(
	tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
	topLoop);
	tiledLoops[2 * width - i - 1] = tileSpaceLoop;
	topLoop = tileSpaceLoop.getOperation();
	}

	// Move the loop body of the original nest to the new one.
	moveLoopBody(origLoops.back(), innermostPointLoop);

	SmallVector<Value, 8> origLoopIVs;
	extractForInductionVars(input, &origLoopIVs);

	FlatAffineConstraints cst;
	getIndexSet(input, &cst);
	if (!cst.isHyperRectangular(0, width)) {
	llvm::dbgs() << "tiled code generation unimplemented for the "
	"non-hyperrectangular case, op:"
	<< *rootAffineForOp << "\n";
	return failure();
	}

	constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes);

	// Replace original IVs with intra-tile loop IVs.
	for (unsigned i = 0; i < width; i++)
	origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar());

	// Erase the old loop nest.
	rootAffineForOp.erase();

	if (tiledNest)
	*tiledNest = std::move(tiledLoops);

	return success();
	}

	// Identify valid and profitable bands of loops to tile. This is currently just
	// a temporary placeholder to test the mechanics of tiled code generation.
	// Returns all maximal outermost perfect loop nests to tile.
	static void getTileableBands(FuncOp f,
	std::vector<SmallVector<AffineForOp, 6>> *bands) {
	// Get maximal perfect nest of 'affine.for' insts starting from root
	// (inclusive).
	auto getMaximalPerfectLoopNest = [&](AffineForOp root) {
	SmallVector<AffineForOp, 6> band;
	getPerfectlyNestedLoops(band, root);
	bands->push_back(band);
	};

	for (auto &block : f)
	for (auto &op : block)
	if (auto forOp = dyn_cast<AffineForOp>(op))
	getMaximalPerfectLoopNest(forOp);
	}

	/// Reduces each tile size to the largest divisor of the corresponding trip
	/// count (if the trip count is known).
	static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band,
	SmallVectorImpl<unsigned> *tileSizes) {
	assert(band.size() == tileSizes->size() && "invalid tile size count");
	for (unsigned i = 0, e = band.size(); i < e; i++) {
	unsigned &tSizeAdjusted = (*tileSizes)[i];
	auto mayConst = getConstantTripCount(band[i]);
	if (!mayConst)
	continue;
	// Adjust the tile size to largest factor of the trip count less than
	// tSize.
	uint64_t constTripCount = mayConst.getValue();
	if (constTripCount > 1 && tSizeAdjusted > constTripCount / 2)
	tSizeAdjusted = constTripCount / 2;
	while (constTripCount % tSizeAdjusted != 0)
	tSizeAdjusted--;
	}
	}

	// Returns tile sizes to use. Checks CL options; if none are specified, sets it
	// based on a simple model that looks at the memory footprint and determines
	// tile sizes assuming identity accesses / 1:1 tile size proportional footprint
	// along each of the dimensions being tiled.
	// TODO: evolve this model. Tile size determination is a large area
	// to play with in general.
	void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
	SmallVectorImpl<unsigned> *tileSizes) {
	if (band.empty())
	return;

	// Use command-line tileSize for all loops if specified.
	if (tileSize) {
	tileSizes->assign(band.size(), tileSize);
	return;
	}

	// Use tileSizes and fill them with default tile size if it's short.
	if (!this->tileSizes.empty()) {
	tileSizes->assign(this->tileSizes.begin(), this->tileSizes.end());
	tileSizes->resize(band.size(), kDefaultTileSize);
	return;
	}
	tileSizes->resize(band.size());

	// The first loop in the band.
	auto rootForOp = band[0];
	(void)rootForOp;

	// Obtain memory footprint and set tile sizes so that a tile fits in
	// the cache size. This is an approximation with the assumption that the
	// footprint increases with the tile size linearly in that dimension (i.e.,
	// assumes one-to-one access function).
	auto fp = getMemoryFootprintBytes(band[0], 0);
	if (!fp) {
	// Fill with default tile sizes if footprint is unknown.
	std::fill(tileSizes->begin(), tileSizes->end(),
	LoopTiling::kDefaultTileSize);
	if (avoidMaxMinBounds)
	adjustToDivisorsOfTripCounts(band, tileSizes);
	LLVM_DEBUG(
	rootForOp.emitWarning("memory footprint unknown: using default tile "
	"sizes adjusted to trip count divisors"));
	return;
	}

	// Check how many times larger the cache size is when compared to footprint.
	uint64_t cacheSizeBytes = cacheSizeInKiB * 1024;
	uint64_t excessFactor = llvm::divideCeil(fp.getValue(), cacheSizeBytes);
	if (excessFactor <= 1) {
	// No need of any tiling - set tile size to 1.
	std::fill(tileSizes->begin(), tileSizes->end(), 1);
	return;
	}

	// Divide all loops equally in an attempt to reduce footprint.
	// TODO: this is approximate. Ideally, obtain reuse factor /
	// profitability along each dimension and weight tile sizes based on that as
	// one possible approach. Or compute a polynomial in tile sizes and solve for
	// it.

	// For an n-d tileable band, compute the n^th root of the excess.
	unsigned tSize =
	static_cast<unsigned>(floorl(std::pow(excessFactor, 1.0 / band.size())));
	// We'll keep a running product to determine the last tile size better.
	unsigned cumulProductOfTileSizes = 1;
	for (unsigned i = 0, e = band.size(); i < e; i++) {
	if (i < e - 1)
	(*tileSizes)[i] = tSize;
	else
	// Set last tile size to cover the balance.
	(*tileSizes)[i] = std::max(
	1U, static_cast<unsigned>(excessFactor / cumulProductOfTileSizes));
	cumulProductOfTileSizes = (tileSizes)[i];
	}
	if (avoidMaxMinBounds)
	adjustToDivisorsOfTripCounts(band, tileSizes);
	}

	void LoopTiling::runOnFunction() {
	// Bands of loops to tile.
	std::vector<SmallVector<AffineForOp, 6>> bands;
	getTileableBands(getFunction(), &bands);

	// Tile each band.
	for (auto &band : bands) {
	// Set up tile sizes; fill missing tile sizes at the end with default tile
	// size or tileSize if one was provided.
	SmallVector<unsigned, 6> tileSizes;
	getTileSizes(band, &tileSizes);
	if (llvm::DebugFlag) {
	auto diag = band[0].emitRemark("using tile sizes [");
	for (auto tSize : tileSizes)
	diag << tSize << ' ';
	diag << "]\n";
	}
	SmallVector<AffineForOp, 6> tiledNest;
	if (failed(tilePerfectlyNested(band, tileSizes, &tiledNest)))
	return signalPassFailure();

	// Separate full and partial tiles.
	if (separate) {
	auto intraTileLoops =
	MutableArrayRef<AffineForOp>(tiledNest).drop_front(band.size());
	separateFullTiles(intraTileLoops);
	}
	}
	}

	constexpr unsigned LoopTiling::kDefaultTileSize;