Diff 480213

mlir/include/mlir/Dialect/Affine/Utils.h

Show First 20 Lines • Show All 353 Lines • ▼ Show 20 Lines	OpFoldResult add(AffineValueExpr lhs, AffineValueExpr rhs) {
return makeComposedFoldedAffineApply(b, loc, {lhs.e + rhs.e}, {lhs, rhs});		return makeComposedFoldedAffineApply(b, loc, {lhs.e + rhs.e}, {lhs, rhs});
}		}
OpFoldResult sub(AffineValueExpr lhs, AffineValueExpr rhs) {		OpFoldResult sub(AffineValueExpr lhs, AffineValueExpr rhs) {
return makeComposedFoldedAffineApply(b, loc, {lhs.e - rhs.e}, {lhs, rhs});		return makeComposedFoldedAffineApply(b, loc, {lhs.e - rhs.e}, {lhs, rhs});
}		}
OpFoldResult mul(AffineValueExpr lhs, AffineValueExpr rhs) {		OpFoldResult mul(AffineValueExpr lhs, AffineValueExpr rhs) {
return makeComposedFoldedAffineApply(b, loc, {lhs.e * rhs.e}, {lhs, rhs});		return makeComposedFoldedAffineApply(b, loc, {lhs.e * rhs.e}, {lhs, rhs});
}		}
		OpFoldResult floor(AffineValueExpr lhs, AffineValueExpr rhs) {
		return makeComposedFoldedAffineApply(b, loc, {lhs.e.floorDiv(rhs.e)},
		{lhs, rhs});
		}
OpFoldResult ceil(AffineValueExpr lhs, AffineValueExpr rhs) {		OpFoldResult ceil(AffineValueExpr lhs, AffineValueExpr rhs) {
return makeComposedFoldedAffineApply(b, loc, {lhs.e.ceilDiv(rhs.e)},		return makeComposedFoldedAffineApply(b, loc, {lhs.e.ceilDiv(rhs.e)},
{lhs, rhs});		{lhs, rhs});
}		}
OpFoldResult min(ArrayRef<OpFoldResult> vals) {		OpFoldResult min(ArrayRef<OpFoldResult> vals) {
return makeComposedFoldedAffineMin(		return makeComposedFoldedAffineMin(
b, loc, AffineMap::getMultiDimIdentityMap(vals.size(), b.getContext()),		b, loc, AffineMap::getMultiDimIdentityMap(vals.size(), b.getContext()),
vals);		vals);
Show All 15 Lines

mlir/lib/Dialect/Tensor/IR/CMakeLists.txt

	Show First 20 Lines • Show All 51 Lines • ▼ Show 20 Lines
	add_mlir_dialect_library(MLIRTensorTilingInterfaceImpl			add_mlir_dialect_library(MLIRTensorTilingInterfaceImpl
	TensorTilingInterfaceImpl.cpp			TensorTilingInterfaceImpl.cpp

	ADDITIONAL_HEADER_DIRS			ADDITIONAL_HEADER_DIRS
	${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Tensor			${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Tensor

	LINK_LIBS PUBLIC			LINK_LIBS PUBLIC
	MLIRAffineDialect			MLIRAffineDialect
				MLIRAffineUtils
	MLIRDialectUtils			MLIRDialectUtils
	MLIRIR			MLIRIR
	MLIRLinalgDialect			MLIRLinalgDialect
				MLIRLinalgUtils
	MLIRSCFDialect			MLIRSCFDialect
	MLIRSupport			MLIRSupport
	MLIRTensorDialect			MLIRTensorDialect
	MLIRTensorUtils			MLIRTensorUtils
	MLIRTilingInterface			MLIRTilingInterface
	)			)

mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp

//===- TensorTilingInterfaceImpl.cpp - Tiling Interface models --*- C++ -*-===//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"

#include "mlir/Dialect/Affine/Utils.h"

#include "mlir/Dialect/Arith/Utils/Utils.h"

#include "mlir/Dialect/Linalg/IR/Linalg.h"

#include "mlir/Dialect/Linalg/Utils/Utils.h"

#include "mlir/Dialect/SCF/IR/SCF.h"

#include "mlir/Dialect/Tensor/IR/Tensor.h"

#include "mlir/Dialect/Tensor/Utils/Utils.h"

#include "mlir/Dialect/Utils/IndexingUtils.h"

#include "mlir/Interfaces/TilingInterface.h"

using namespace mlir;

using namespace mlir::tensor;

▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines

getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,

SmallVector<OpFoldResult> &resultOffsets,

SmallVector<OpFoldResult> &resultSizes) const {

resultOffsets.assign(offsets.begin(), offsets.end());

resultSizes.assign(sizes.begin(), sizes.end());

return success();

}

};

/// Builds the iteration domain shared by the pack and unpack tiling models.
///
/// One `Range` is produced per loop; each has offset 0, stride 1 and a size
/// taken from the first reified result shape of `op`. The number of loops is
/// the source rank for PackOp and the destination rank for UnPackOp. The
/// builder's insertion point is restored before returning.
template <typename OpTy>
static SmallVector<Range> getPackUnPackIterationDomain(OpTy op,
                                                       OpBuilder &builder) {
  static_assert(llvm::is_one_of<OpTy, PackOp, UnPackOp>::value,
                "applies to only pack or unpack operations");
  OpBuilder::InsertionGuard restoreInsertionPoint(builder);
  Location loc = op.getLoc();
  int64_t numLoops = (std::is_same<OpTy, PackOp>::value) ? op.getSourceRank()
                                                         : op.getDestRank();

  // Constants shared by every range of the domain.
  Value lowerBound = builder.create<arith::ConstantIndexOp>(loc, 0);
  Value unitStride = builder.create<arith::ConstantIndexOp>(loc, 1);

  // The status returned by reifyResultShapes is deliberately discarded.
  ReifiedRankedShapedTypeDims reifiedShapes;
  (void)op.reifyResultShapes(builder, reifiedShapes);

  SmallVector<Range> domain(numLoops);
  for (int64_t idx = 0; idx < numLoops; ++idx) {
    Range &range = domain[idx];
    range.offset = lowerBound;
    range.stride = unitStride;
    range.size = reifiedShapes[0][idx];
  }
  return domain;
}

/// Applies the inverse of `permutation` to both `offsets` and `sizes`, in
/// place. An empty `permutation` leaves both vectors untouched.
static void applyInversePermToRange(SmallVector<OpFoldResult> &offsets,
                                    SmallVector<OpFoldResult> &sizes,
                                    ArrayRef<int64_t> permutation) {
  if (!permutation.empty()) {
    SmallVector<int64_t> inverse = invertPermutationVector(permutation);
    applyPermutationToVector<OpFoldResult>(offsets, inverse);
    applyPermutationToVector<OpFoldResult>(sizes, inverse);
  }
}

struct PackOpTiling

: public TilingInterface::ExternalModel<PackOpTiling, PackOp> {

/// Returns one parallel iterator type per source dimension of the pack op.
///
/// Only the untiled dimensions and the outer tiled data dimensions are
/// modelled as loops here; the inner tiled data dimensions are materialized
/// when the body of the operation is built.
SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
  return SmallVector<utils::IteratorType>(cast<PackOp>(op).getSourceRank(),
                                          utils::IteratorType::parallel);
}

/// Returns the iteration domain of the pack op by delegating to the helper
/// shared with the unpack model.
///
/// The statements that previously followed the `return` were unreachable
/// leftovers of the hand-written domain computation and have been removed.
/// No insertion guard is taken here because the helper installs its own.
SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
  return getPackUnPackIterationDomain<PackOp>(cast<PackOp>(op), b);
}

SmallVector<Operation *>

getTiledImplementation(Operation *op, OpBuilder &b,

ArrayRef<OpFoldResult> offsets,

ArrayRef<OpFoldResult> sizes) const {

auto packOp = cast<PackOp>(op);

Location loc = packOp.getLoc();

// The tiling is applied on interchanged dimensions. We have to undo the

// interchange to map sizes and offsets to the original input.

int64_t inputRank = packOp.getSourceRank();

ArrayRef<int64_t> dimsToOuterBlock(packOp.getOuterDimsPerm());

SmallVector<OpFoldResult> origOffsets(offsets.begin(), offsets.end());

SmallVector<OpFoldResult> origSizes(sizes.begin(), sizes.end());

if (!dimsToOuterBlock.empty()) {

applyInversePermToRange(origOffsets, origSizes, packOp.getOuterDimsPerm());

SmallVector<int64_t> inversedPerm =

invertPermutationVector(dimsToOuterBlock);

applyPermutationToVector<OpFoldResult>(origOffsets, inversedPerm);

applyPermutationToVector<OpFoldResult>(origSizes, inversedPerm);

}

DenseMap<int64_t, OpFoldResult> dimAndTileMapping =

packOp.getDimAndTileMapping();

SmallVector<OpFoldResult> srcDimValues =

tensor::createDimValues(b, loc, packOp.getSource());

SmallVector<OpFoldResult> inputIndices, inputSizes;

for (auto dim : llvm::seq<int64_t>(0, inputRank)) {

using AV = AffineValueExpr;

▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines

getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,

resultSizes.assign(sizes.begin(), sizes.end());

for (auto dataTileDim : llvm::seq<unsigned>(inputRank, outputRank))

resultSizes.push_back(getAsOpFoldResult(outputShape[0][dataTileDim]));

return success();

}

};

/// Per-dimension description of how one destination tile of an unpack op maps
/// onto the packed source. Produced by getUnpackTileDimInfo and consumed by
/// UnPackOpTiling::getTiledImplementation.
struct UnpackTileDimInfo {
  /// True when the dimension is not packed, or when the constant bound of the
  /// tile size is a multiple of the constant inner tile size. Defaults to
  /// false so the flag never holds an indeterminate value.
  bool isAlignedToInnerTileSize = false;
  /// Offset of the slice taken from the unpack source along this dimension.
  OpFoldResult sourceOffset;
  /// Size of the slice taken from the unpack source along this dimension.
  OpFoldResult sourceSize;
  /// Offset used to extract the requested tile from the result of the tiled
  /// unpack when the tile is not aligned to the inner tile size.
  OpFoldResult resultOffset;
  /// Size of the temporary destination along this dimension when the tiled
  /// unpack cannot write directly into a slice of the original destination.
  OpFoldResult destExpandedSize;
};

/// Computes how the destination tile [`tileOffset`, `tileOffset` + `tileSize`)
/// along destination dimension `tileDim` of `unpackOp` maps onto the packed
/// source, and returns the result as an UnpackTileDimInfo.
///
/// Four cases are distinguished:
///  1. `tileDim` is not a packed dimension: offset and size are forwarded
///     unchanged and the tile is reported as aligned.
///  2. The constant bound of `tileSize` equals the constant inner tile size:
///     exactly one outer tile is read, at `tileOffset floordiv innerTileSize`.
///  3. The constant bound of `tileSize` is a multiple of the constant inner
///     tile size: `tileSize ceildiv innerTileSize` outer tiles are read,
///     starting at `tileOffset floordiv innerTileSize`.
///  4. Otherwise: the first and last element of the tile are located with
///     div/mod by the inner tile size; the source slice spans every outer tile
///     between them, and `resultOffset` is the position of the first element
///     inside the first outer tile.
///
/// Index arithmetic is emitted through `b` at the location of `unpackOp`.
static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp,
                                              int64_t tileDim,
                                              OpFoldResult tileOffset,
                                              OpFoldResult tileSize) {
  UnpackTileDimInfo info;
  Attribute zeroAttr = b.getIndexAttr(0);
  Attribute oneAttr = b.getIndexAttr(1);
  DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
      unpackOp.getDimAndTileMapping();
  // The dimension is not one of packed data dimension.
  if (!dimAndTileMapping.count(tileDim)) {
    info.isAlignedToInnerTileSize = true;
    info.sourceOffset = tileOffset;
    info.sourceSize = tileSize;
    info.resultOffset = zeroAttr;
    info.destExpandedSize = tileSize;
    return info;
  }

  Location loc = unpackOp.getLoc();
  using AV = AffineValueExpr;
  AffineBuilder ab(b, loc);
  AffineExpr dim0, dim1, sym0;
  bindDims(b.getContext(), dim0, dim1);
  bindSymbols(b.getContext(), sym0);

  OpFoldResult innerTileSize = dimAndTileMapping[tileDim];

  // Alignment can only be established when both the (upper bound of the) tile
  // size and the inner tile size are known constants.
  info.isAlignedToInnerTileSize = false;
  FailureOr<int64_t> cstSize = linalg::getConstantUpperBoundForIndex(
      getValueOrCreateConstantIndexOp(b, loc, tileSize));
  Optional<int64_t> cstInnerSize = getConstantIntValue(innerTileSize);
  if (!failed(cstSize) && cstInnerSize) {
    if (cstSize.value() % cstInnerSize.value() == 0)
      info.isAlignedToInnerTileSize = true;

    // If the tiling size equals to the inner tiling size, the outer dims are
    // always 1.
    if (cstInnerSize.value() == cstSize.value()) {
      auto lhs = AV(dim0).bind(tileOffset);
      auto rhs = AV(dim1).bind(innerTileSize);
      info.sourceOffset = ab.floor(lhs, rhs);
      info.sourceSize = oneAttr;
      info.resultOffset = zeroAttr;
      info.destExpandedSize = tileSize;
      return info;
    }
  }

  if (info.isAlignedToInnerTileSize) {
    info.sourceOffset =
        ab.floor(AV(dim0).bind(tileOffset), AV(dim1).bind(innerTileSize));
    info.resultOffset = zeroAttr;
    info.destExpandedSize = tileSize;

    // The ceilDiv is needed here because there could be an incomplete tile
    // even in perfect tiling cases. E.g.,
    //   %0 = unpack tensor<33x2xf32> into tensor<64xf32>
    // If the tiling size is 32, there will be 3 tiles. Two of them have
    // size=32; one of them has size=2. The size is represented using
    // affine_min op; we need ceilDiv.
    info.sourceSize =
        ab.ceil(AV(dim0).bind(tileSize), AV(dim1).bind(innerTileSize));
    return info;
  }

  // General case: locate the first and the last element of the tile in the
  // packed layout (quotient = outer tile index, remainder = position inside
  // that tile).
  DivModValue firstCoord =
      getDivMod(b, loc, getValueOrCreateConstantIndexOp(b, loc, tileOffset),
                getValueOrCreateConstantIndexOp(b, loc, innerTileSize));
  OpFoldResult tileExclusiveBound =
      ab.add(AV(dim0).bind(tileOffset), AV(dim1).bind(tileSize));
  DivModValue lastCoord = getDivMod(
      b, loc,
      getValueOrCreateConstantIndexOp(
          b, loc,
          ab.sub(AV(dim0).bind(tileExclusiveBound), AV(dim1).bind(oneAttr))),
      getValueOrCreateConstantIndexOp(b, loc, innerTileSize));

  // Number of outer tiles touched = last outer index - first outer index + 1.
  OpFoldResult lengthMinusOne = ab.sub(AV(dim0).bind(lastCoord.quotient),
                                       AV(dim1).bind(firstCoord.quotient));
  info.sourceSize =
      ab.add(AV(dim0).bind(lengthMinusOne), AV(dim1).bind(oneAttr));
  info.sourceOffset = firstCoord.quotient;
  info.resultOffset = firstCoord.remainder;
  // The temporary destination holds every touched outer tile in full.
  info.destExpandedSize =
      ab.mul(AV(dim0).bind(info.sourceSize), AV(sym0).bind(innerTileSize));
  return info;
}

struct UnPackOpTiling

: public TilingInterface::ExternalModel<UnPackOpTiling, UnPackOp> {

/// Returns one parallel iterator type per destination dimension of the
/// unpack op.
SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
  return SmallVector<utils::IteratorType>(cast<UnPackOp>(op).getDestRank(),
                                          utils::IteratorType::parallel);
}

/// Returns the iteration domain of the unpack op, computed by the helper
/// shared with the pack model.
SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
  auto unpackOp = cast<UnPackOp>(op);
  return getPackUnPackIterationDomain<UnPackOp>(unpackOp, b);
}

/// Builds the tiled unpack for the destination tile described by `offsets`
/// and `sizes` (both indexed by destination dimension).
///
/// The source is sliced to the outer tiles that cover the requested tile, with
/// all inner tile dimensions taken in full. Then:
///  - perfect tiling: the tiled unpack writes into the matching slice of the
///    original destination and is the only returned op;
///  - otherwise: the tiled unpack writes into a fresh tensor.empty large
///    enough for every touched outer tile, and a trailing extract_slice picks
///    the requested tile out of it. Both ops are returned, unpack first.
SmallVector<Operation *>
getTiledImplementation(Operation *op, OpBuilder &b,
                       ArrayRef<OpFoldResult> offsets,
                       ArrayRef<OpFoldResult> sizes) const {
  auto unpackOp = cast<UnPackOp>(op);
  int64_t srcRank = unpackOp.getSourceRank();
  int64_t destRank = unpackOp.getDestRank();
  int64_t numInnerTiles = srcRank - destRank;
  Location loc = unpackOp.getLoc();

  // The perfect tiling case indicates that the tiling sizes are multiple of
  // inner_tile_size. In this context, every tile starts at an inner-tile
  // boundary (the per-dimension result offset is zero), so no extra data is
  // needed when representing the tiled unpack op.
  bool isPerfectTilingCase = true;
  Attribute oneAttr = b.getIndexAttr(1);
  SmallVector<OpFoldResult> sliceSrcStrides(destRank, oneAttr);
  SmallVector<OpFoldResult> sliceSrcIndices, sliceSrcSizes;
  SmallVector<OpFoldResult> destExpandedSizes, resultOffsetsFromDest;
  for (auto dim : llvm::seq<int64_t>(0, destRank)) {
    UnpackTileDimInfo info =
        getUnpackTileDimInfo(b, unpackOp, dim, offsets[dim], sizes[dim]);
    if (!info.isAlignedToInnerTileSize)
      isPerfectTilingCase = false;
    sliceSrcIndices.push_back(info.sourceOffset);
    sliceSrcSizes.push_back(info.sourceSize);
    destExpandedSizes.push_back(info.destExpandedSize);
    resultOffsetsFromDest.push_back(info.resultOffset);
  }

  // The offsets and sizes above were computed per destination dimension. The
  // outer dimensions of the source are interchanged when outer_dims_perm is
  // set, so the inverse permutation is applied before slicing the source.
  applyInversePermToRange(sliceSrcIndices, sliceSrcSizes,
                          unpackOp.getOuterDimsPerm());

  // Inner tile dimensions are always read in full: offset 0, the tile size
  // itself, stride 1.
  Attribute zeroAttr = b.getIndexAttr(0);
  sliceSrcIndices.append(numInnerTiles, zeroAttr);
  sliceSrcSizes.append(unpackOp.getMixedTiles());
  sliceSrcStrides.append(numInnerTiles, oneAttr);
  Value sliceSource =
      b.create<ExtractSliceOp>(loc, unpackOp.getSource(), sliceSrcIndices,
                               sliceSrcSizes, sliceSrcStrides);

  SmallVector<OpFoldResult> destStrides(destRank, oneAttr);
  Value sliceDest;
  if (isPerfectTilingCase) {
    sliceDest = b.create<ExtractSliceOp>(loc, unpackOp.getDest(), offsets,
                                         sizes, destStrides);
  } else {
    sliceDest = b.create<EmptyOp>(loc, destExpandedSizes,
                                  unpackOp.getDestType().getElementType());
  }

  Operation *tiledUnpackOp =
      b.create<UnPackOp>(loc, TypeRange{sliceDest.getType()},
                         ValueRange{sliceSource, sliceDest}, op->getAttrs());

  if (isPerfectTilingCase)
    return {tiledUnpackOp};

  // The unpacked temporary covers whole inner tiles; cut the requested tile
  // out of it.
  Operation *extractSlice =
      b.create<ExtractSliceOp>(loc, tiledUnpackOp->getResult(0),
                               resultOffsetsFromDest, sizes, destStrides);
  return {tiledUnpackOp, extractSlice};
}

/// Reports where the tile lands in the result: the iteration-space `offsets`
/// and `sizes` are copied unchanged into `resultOffsets` and `resultSizes`.
/// Always succeeds.
LogicalResult
getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,
                      ArrayRef<OpFoldResult> offsets,
                      ArrayRef<OpFoldResult> sizes,
                      SmallVector<OpFoldResult> &resultOffsets,
                      SmallVector<OpFoldResult> &resultSizes) const {
  resultOffsets.assign(offsets.begin(), offsets.end());
  resultSizes.assign(sizes.begin(), sizes.end());
  return success();
}

};

} // namespace

Operation *tensor::bubbleUpPadSlice(OpBuilder &b, tensor::PadOp padOp,

ArrayRef<OpFoldResult> offsets,

ArrayRef<OpFoldResult> sizes,

bool generateZeroSliceGuard) {

// Only constant padding value supported.

Value padValue = padOp.getConstantPaddingValue();

▲ Show 20 Lines • Show All 199 Lines • ▼ Show 20 Lines

Operation *tensor::bubbleUpPadSlice(OpBuilder &b, tensor::PadOp padOp,

return createPadOfExtractSlice();

}

/// Adds an extension to `registry` that attaches the TilingInterface external
/// models defined in this file to tensor.pad, tensor.pack and tensor.unpack.
void mlir::tensor::registerTilingInterfaceExternalModels(
    DialectRegistry &registry) {
  // The unary plus turns the capture-less lambda into a plain function
  // pointer before it is handed to the registry.
  auto attachModels = +[](MLIRContext *ctx, TensorDialect *dialect) {
    tensor::PadOp::attachInterface<PadOpTiling>(*ctx);
    tensor::PackOp::attachInterface<PackOpTiling>(*ctx);
    tensor::UnPackOp::attachInterface<UnPackOpTiling>(*ctx);
  };
  registry.addExtension(attachModels);
}

mlir/test/Dialect/Tensor/tiling.mlir

Show First 20 Lines • Show All 206 Lines • ▼ Show 20 Lines	func.func @pad_and_pack_fully_dynamic(%source: tensor<?x?xf32>, %dest: tensor<?x?x?x?xf32>, %pad: f32, %tile_n : index, %tile_m : index) -> tensor<?x?x?x?xf32> {
return %0 : tensor<?x?x?x?xf32>		return %0 : tensor<?x?x?x?xf32>
}		}

transform.sequence failures(propagate) {		transform.sequence failures(propagate) {
^bb0(%arg1: !pdl.operation):		^bb0(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["tensor.pack"]} in %arg1		%0 = transform.structured.match ops{["tensor.pack"]} in %arg1
%1, %loops:2 = transform.structured.tile_to_scf_for %0 [2, 4]		%1, %loops:2 = transform.structured.tile_to_scf_for %0 [2, 4]
}		}

		// -----

		// Tiles a tensor.unpack (inner_tiles = [32, 16], no outer_dims_perm) with
		// tile sizes [2, 4]. The expected IR slices the source, unpacks into a
		// tensor.empty, and then extracts the 2x4 tile from the unpack result
		// before inserting it into the loop-carried destination.
		// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)>
		// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 mod 32)>
		// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ((d0 + 1) floordiv 32 - d0 floordiv 32 + 1)>
		// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0) -> (((d0 + 1) floordiv 32) * 32 - (d0 floordiv 32) * 32 + 32)>
		// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 floordiv 16)>
		// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 mod 16)>
		// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0) -> ((d0 + 3) floordiv 16 - d0 floordiv 16 + 1)>
		// CHECK-DAG: #[[MAP7:.+]] = affine_map<(d0) -> (((d0 + 3) floordiv 16) * 16 - (d0 floordiv 16) * 16 + 16)>
		// CHECK: func.func @NCnc_to_NC
		// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]:
		// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]:
		// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
		// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
		// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
		// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
		// CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index
		// CHECK: %{{.+}} = scf.for %[[I:.+]] = %[[C0]] to %[[C256]] step %[[C2]]
		// CHECK: %{{.+}} = scf.for %[[J:.+]] = %[[C0]] to %[[C128]] step %[[C4]]
		// CHECK-DAG: %[[IN_I:.+]] = affine.apply #[[MAP0]](%[[I]])
		// CHECK-DAG: %[[OFFSET_I:.+]] = affine.apply #[[MAP1]](%[[I]])
		// CHECK-DAG: %[[IN_I_SZ:.+]] = affine.apply #[[MAP2]](%[[I]])
		// CHECK-DAG: %[[IN_J:.+]] = affine.apply #[[MAP4]](%[[J]])
		// CHECK-DAG: %[[OFFSET_J:.+]] = affine.apply #[[MAP5]](%[[J]])
		// CHECK-DAG: %[[IN_J_SZ:.+]] = affine.apply #[[MAP6]](%[[J]])
		// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[IN]]
		// CHECK-SAME: [%[[IN_I]], %[[IN_J]], 0, 0] [%[[IN_I_SZ]], %[[IN_J_SZ]], 32, 16]
		// CHECK-SAME: : tensor<8x8x32x16xf32> to tensor<?x?x32x16xf32>
		// CHECK: %[[EMPTY:.+]] = tensor.empty
		// CHECK: %[[UNPACK:.+]] = tensor.unpack
		// CHECK-SAME: %[[SLICE]] inner_dims_pos = [0, 1] inner_tiles = [32, 16]
		// CHECK-SAME: into %[[EMPTY]]
		// CHECK: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]]
		// CHECK-SAME: [%[[OFFSET_I]], %[[OFFSET_J]]] [2, 4]
		// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_SLICE]]
		// CHECK-SAME: into %{{.+}}[%[[I]], %[[J]]] [2, 4]
		// CHECK: scf.yield %[[RES]]
		func.func @NCnc_to_NC(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
		%0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
		return %0 : tensor<256x128xf32>
		}

		// Tile the matched tensor.unpack with sizes [2, 4] using scf.for loops.
		transform.sequence failures(propagate) {
		^bb0(%arg1: !pdl.operation):
		%0 = transform.structured.match ops{["tensor.unpack"]} in %arg1
		%1, %loops:2 = transform.structured.tile_to_scf_for %0 [2, 4]
		}

		// -----

		// Same tiling as above but with outer_dims_perm = [1, 0] and
		// inner_tiles = [32, 8]. The source slice is expected to be indexed in
		// interchanged order ([C, K, 0, 0]) while the result tile stays in
		// destination order ([K, C]).
		// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)>
		// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 mod 32)>
		// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ((d0 + 1) floordiv 32 - d0 floordiv 32 + 1)>
		// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0) -> (((d0 + 1) floordiv 32) * 32 - (d0 floordiv 32) * 32 + 32)>
		// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 floordiv 8)>
		// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 mod 8)>
		// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0) -> ((d0 + 3) floordiv 8 - d0 floordiv 8 + 1)>
		// CHECK-DAG: #[[MAP7:.+]] = affine_map<(d0) -> (((d0 + 3) floordiv 8) * 8 - (d0 floordiv 8) * 8 + 8)>
		// CHECK: func.func @CKkc_to_KC
		// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]:
		// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]:
		// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
		// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
		// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
		// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
		// CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index
		// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C128]] step %[[C2]]
		// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[C256]] step %[[C4]]
		// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP0]](%[[K]])
		// CHECK-DAG: %[[OFFSET_K:.+]] = affine.apply #[[MAP1]](%[[K]])
		// CHECK-DAG: %[[IN_K_SZ:.+]] = affine.apply #[[MAP2]](%[[K]])
		// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP4]](%[[C]])
		// CHECK-DAG: %[[OFFSET_C:.+]] = affine.apply #[[MAP5]](%[[C]])
		// CHECK-DAG: %[[IN_C_SZ:.+]] = affine.apply #[[MAP6]](%[[C]])
		// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]]
		// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [%[[IN_C_SZ]], %[[IN_K_SZ]], 32, 8]
		// CHECK: %[[EMPTY:.+]] = tensor.empty
		// CHECK: %[[UNPACK:.+]] = tensor.unpack
		// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8]
		// CHECK-SAME: into %[[EMPTY]]
		// CHECK: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]]
		// CHECK-SAME: [%[[OFFSET_K]], %[[OFFSET_C]]] [2, 4]
		// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_SLICE]]
		// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [2, 4]
		// CHECK: scf.yield %[[RES]]
		func.func @CKkc_to_KC(%source: tensor<32x4x32x8xf32>, %dest: tensor<128x256xf32>) -> tensor<128x256xf32> {
		%0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %dest : tensor<32x4x32x8xf32> -> tensor<128x256xf32>
		return %0 : tensor<128x256xf32>
		}

		// Tile the matched tensor.unpack with sizes [2, 4] using scf.for loops.
		transform.sequence failures(propagate) {
		^bb0(%arg1: !pdl.operation):
		%0 = transform.structured.match ops{["tensor.unpack"]} in %arg1
		%1, %loops:2 = transform.structured.tile_to_scf_for %0 [2, 4]
		}

		// -----

		// Perfect tiling: the tile sizes [2, 4] equal the inner_tiles [2, 4], so
		// exactly one outer tile is read per dimension (source slice sizes
		// [1, 1, 2, 4]) and the tiled unpack writes into a slice of the
		// loop-carried destination instead of a tensor.empty.
		// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 2)>
		// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 floordiv 4)>
		// CHECK: func.func @perfect_CKkc_to_KC
		// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]:
		// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]:
		// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
		// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
		// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
		// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
		// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
		// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C8]] step %[[C2]]
		// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[C128]] step %[[C4]]
		// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP0]](%[[K]])
		// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP1]](%[[C]])
		// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]]
		// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [1, 1, 2, 4]
		// CHECK: %[[ITER_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[K]], %[[C]]] [2, 4]
		// TODO: Add FoldTensorCastOp patterns for unpack op, then we do not need
		// tensor.cast here.
		// CHECK: %[[ITER_CAST:.+]] = tensor.cast %[[ITER_SLICE]]
		// CHECK: %[[UNPACK:.+]] = tensor.unpack
		// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 4]
		// CHECK-SAME: into %[[ITER_CAST]]
		// CHECK: %[[UNPACK_CAST:.+]] = tensor.cast %[[UNPACK]]
		// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_CAST]]
		// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [2, 4]
		// CHECK: scf.yield %[[RES]]
		func.func @perfect_CKkc_to_KC(%source: tensor<32x4x2x4xf32>, %dest: tensor<8x128xf32>) -> tensor<8x128xf32> {
		%0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %dest : tensor<32x4x2x4xf32> -> tensor<8x128xf32>
		return %0 : tensor<8x128xf32>
		}

		// Tile the matched tensor.unpack with sizes [2, 4] using scf.for loops.
		transform.sequence failures(propagate) {
		^bb0(%arg1: !pdl.operation):
		%0 = transform.structured.match ops{["tensor.unpack"]} in %arg1
		%1, %loops:2 = transform.structured.tile_to_scf_for %0 [2, 4]
		}

		// -----

		// Perfect tiling with dynamic shapes: inner_tiles = [2, 2], tile sizes
		// [2, 4]. The tile sizes are affine.min values bounded by 2 and 4, so the
		// K dimension reads one outer tile and the C dimension reads
		// (size ceildiv 2) outer tiles; the unpack writes into a slice of the
		// loop-carried destination.
		// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
		// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
		// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 2)>
		// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 2)>
		// CHECK: func.func @dynamic_perfect_CKkc_to_KC
		// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]:
		// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]:
		// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
		// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
		// The step of the K loop is captured here rather than relying on a value
		// of C2 left over from an earlier test case.
		// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
		// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
		// CHECK-DAG: %[[DIM_0:.+]] = tensor.dim %[[OUT]], %[[C0]]
		// CHECK-DAG: %[[DIM_1:.+]] = tensor.dim %[[OUT]], %[[C1]]
		// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[DIM_0]] step %[[C2]]
		// CHECK-DAG: %[[OUT_K_SZ:.+]] = affine.min #[[MAP0]](%[[K]])[%[[DIM_0]]]
		// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[DIM_1]] step %[[C4]]
		// CHECK-DAG: %[[OUT_C_SZ:.+]] = affine.min #[[MAP1]](%[[C]])[%[[DIM_1]]]
		// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP2]](%[[K]])
		// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP2]](%[[C]])
		// CHECK-DAG: %[[IN_C_SZ:.+]] = affine.apply #[[MAP3]](%[[OUT_C_SZ]])
		// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]]
		// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [%[[IN_C_SZ]], 1, 2, 2]
		// CHECK: %[[ITER_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[K]], %[[C]]] [%[[OUT_K_SZ]], %[[OUT_C_SZ]]]
		// CHECK: %[[UNPACK:.+]] = tensor.unpack
		// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 2]
		// CHECK-SAME: into %[[ITER_SLICE]]
		// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]]
		// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [%[[OUT_K_SZ]], %[[OUT_C_SZ]]]
		// CHECK: scf.yield %[[RES]]
		func.func @dynamic_perfect_CKkc_to_KC(%source: tensor<?x?x2x2xf32>, %dest: tensor<?x?xf32>) -> tensor<?x?xf32> {
		%0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %dest : tensor<?x?x2x2xf32> -> tensor<?x?xf32>
		return %0 : tensor<?x?xf32>
		}

		// Tile the matched tensor.unpack with sizes [2, 4] using scf.for loops.
		transform.sequence failures(propagate) {
		^bb0(%arg1: !pdl.operation):
		%0 = transform.structured.match ops{["tensor.unpack"]} in %arg1
		%1, %loops:2 = transform.structured.tile_to_scf_for %0 [2, 4]
		}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][tensor] Implement TilingInterface for unpack op
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 480213

mlir/include/mlir/Dialect/Affine/Utils.h

mlir/lib/Dialect/Tensor/IR/CMakeLists.txt

mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp

mlir/test/Dialect/Tensor/tiling.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][tensor] Implement TilingInterface for unpack op
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 480213

mlir/include/mlir/Dialect/Affine/Utils.h

mlir/lib/Dialect/Tensor/IR/CMakeLists.txt

mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp

mlir/test/Dialect/Tensor/tiling.mlir

[mlir][tensor] Implement TilingInterface for unpack op
ClosedPublic