Diff 460141

mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp

Show First 20 Lines • Show All 85 Lines • ▼ Show 20 Lines	static AffineMap reindexIndexingMap(AffineMap map) {
assert(map.isProjectedPermutation(/*allowZeroInResults=*/true) &&		assert(map.isProjectedPermutation(/*allowZeroInResults=*/true) &&
"expected projected permutation");		"expected projected permutation");
auto res = compressUnusedDims(map);		auto res = compressUnusedDims(map);
assert(res.getNumDims() == res.getNumResults() &&		assert(res.getNumDims() == res.getNumResults() &&
"expected reindexed map with same number of dims and results");		"expected reindexed map with same number of dims and results");
return res;		return res;
}		}

		/// Helper enum to represent conv1d input traversal order.
		enum class Conv1DOpOrder {
		Ncw, // Corresponds to operation that traverses the input in (n, c, w) order.
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions can we rephrase to avoid newlines? nicolasvasilache: can we rephrase to avoid newlines?
		raikonenfnuAuthorUnsubmitted Done Reply Inline Actions Done! raikonenfnu: Done!
		Nwc // Corresponds to operation that traverses the input in (n, w, c) order.
		};

/// Helper data structure to represent the result of vectorization.		/// Helper data structure to represent the result of vectorization.
		hanchungUnsubmitted Not Done Reply Inline Actions let's use enum class here, and we can use switch-case in `conv`. https://abseil.io/tips/86 hanchung: let's use enum class here, and we can use switch-case in `conv`. https://abseil.io/tips/86
		raikonenfnuAuthorUnsubmitted Done Reply Inline Actions great idea, done, thanks! :) raikonenfnu: great idea, done, thanks! :)
		hanchungUnsubmitted Done Reply Inline Actions We should use `enum class Conv1DOpOrder`, not `enum Conv1DOpOrder`. hanchung: We should use `enum class Conv1DOpOrder`, not `enum Conv1DOpOrder`.
		raikonenfnuAuthorUnsubmitted Done Reply Inline Actions done! :) raikonenfnu: done! :)
/// In certain specific cases, like terminators, we do not want to propagate/		/// In certain specific cases, like terminators, we do not want to propagate/
enum VectorizationStatus {		enum VectorizationStatus {
/// Op failed to vectorize.		/// Op failed to vectorize.
Failure = 0,		Failure = 0,
/// Op vectorized and custom function took care of replacement logic		/// Op vectorized and custom function took care of replacement logic
NoReplace,		NoReplace,
/// Op vectorized into a new Op whose results will replace original Op's		/// Op vectorized into a new Op whose results will replace original Op's
/// results.		/// results.
▲ Show 20 Lines • Show All 1,204 Lines • ▼ Show 20 Lines
/// Iters: ({Par(), Par(), Par(), Red(), Red()})		/// Iters: ({Par(), Par(), Par(), Red(), Red()})
/// Layout: {{n, strideW * w + dilationW * kw, c}, {kw, c, f}, {n, w, f}}		/// Layout: {{n, strideW * w + dilationW * kw, c}, {kw, c, f}, {n, w, f}}
/// ```		/// ```
/// kw is unrolled, w is unrolled iff strideW > 1.		/// kw is unrolled, w is unrolled iff strideW > 1.
///		///
/// or		/// or
///		///
/// ```		/// ```
		/// Op def: ( n, c, w, f, kw )
		/// Iters: ({Par(), Par(), Par(), Red(), Red()})
		/// Layout: {{n, c, strideW * w + dilationW * kw}, {f, c, kw}, {n, f, w}}
		/// ```
		/// kw is unrolled, w is unrolled iff strideW > 1.
		///
		/// or
		///
		/// ```
/// Op def: ( n, w, c, kw )		/// Op def: ( n, w, c, kw )
/// Iters: ({Par(), Par(), Par(), Red()})		/// Iters: ({Par(), Par(), Par(), Red()})
/// Layout: {{n, strideW * w + dilationW * kw, c}, {kw, c}, {n, w, c}}		/// Layout: {{n, strideW * w + dilationW * kw, c}, {kw, c}, {n, w, c}}
/// ```		/// ```
/// kw is unrolled, w is unrolled iff dilationW > 1.		/// kw is unrolled, w is unrolled iff dilationW > 1.
struct Conv1DNwcGenerator : public StructuredGenerator<LinalgOp> {		struct Conv1DGenerator : public StructuredGenerator<LinalgOp> {
Conv1DNwcGenerator(OpBuilder &builder, LinalgOp linalgOp, int strideW,		Conv1DGenerator(OpBuilder &builder, LinalgOp linalgOp, int strideW,
int dilationW)		int dilationW)
: StructuredGenerator<LinalgOp>(builder, linalgOp), strideW(strideW),		: StructuredGenerator<LinalgOp>(builder, linalgOp), strideW(strideW),
dilationW(dilationW) {		dilationW(dilationW) {
// Determine whether `linalgOp` can be generated with this generator		// Determine whether `linalgOp` can be generated with this generator
if (linalgOp.getNumInputs() != 2 || linalgOp.getNumOutputs() != 1)		if (linalgOp.getNumInputs() != 2 || linalgOp.getNumOutputs() != 1)
return;		return;
lhsShaped = linalgOp.inputs()[0];		lhsShaped = linalgOp.inputs()[0];
rhsShaped = linalgOp.inputs()[1];		rhsShaped = linalgOp.inputs()[1];
resShaped = linalgOp.outputs()[0];		resShaped = linalgOp.outputs()[0];
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	struct Conv1DGenerator : public StructuredGenerator<LinalgOp> {
/// ```		/// ```
/// Op def: ( n, w, c, kw, f )		/// Op def: ( n, w, c, kw, f )
/// Iters: ({Par(), Par(), Par(), Red(), Red()})		/// Iters: ({Par(), Par(), Par(), Red(), Red()})
/// Layout: {{n, strideW * w + dilationW * kw, c}, {kw, c, f}, {n, w, f}}		/// Layout: {{n, strideW * w + dilationW * kw, c}, {kw, c, f}, {n, w, f}}
/// ```		/// ```
/// kw is always unrolled.		/// kw is always unrolled.
/// TODO: w (resp. kw) is unrolled when the strideW ( resp. dilationW) is		/// TODO: w (resp. kw) is unrolled when the strideW ( resp. dilationW) is
/// > 1.		/// > 1.
FailureOr<Operation *> conv() {		FailureOr<Operation *> conv(Conv1DOpOrder conv1DOpOrder) {
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions Should this be an enum Conv1DOpOrder { Ncw, // This corresponds to a computation that traverses the input in order In(n, c, w) Nwc // This corresponds to a computation that traverses the input in order In(n, w, c) }; etc .. ? Note: I called this Conv1DOpOrder, it is definitely not a "data layout" but really an op definition property. nicolasvasilache: Should this be an ``` enum Conv1DOpOrder { Ncw, // This corresponds to a computation that…
if (!valid)		if (!valid)
return failure();		return failure();

int64_t nSize, wSize, cSize, kwSize, fSize;		int64_t nSize, wSize, cSize, kwSize, fSize;
		SmallVector<int64_t, 3> lhsShape, rhsShape, resShape;
		switch (conv1DOpOrder) {
		case Conv1DOpOrder::Nwc:
// kernel{kw, c, f}		// kernel{kw, c, f}
bindShapeDims(rhsShapedType, kwSize, cSize, fSize);		bindShapeDims(rhsShapedType, kwSize, cSize, fSize);
// out{n, w, f}		// out{n, w, f}
bindShapeDims(resShapedType, nSize, wSize);		bindShapeDims(resShapedType, nSize, wSize);
		lhsShape = {nSize,
		// iw = ow * sw + kw * dw - 1
		// (i.e. 16 convolved with 3 (@stride 1 dilation 1) -> 14)
		// Perform the proper inclusive -> exclusive -> inclusive.
		((wSize - 1) * strideW + 1) + ((kwSize - 1) * dilationW + 1) -
		1,
		cSize};
		rhsShape = {kwSize, cSize, fSize};
		resShape = {nSize, wSize, fSize};
		break;
		case Conv1DOpOrder::Ncw:
		// kernel{f, c, kw}
		bindShapeDims(rhsShapedType, fSize, cSize, kwSize);
		// out{n, f, w}
		bindShapeDims(resShapedType, nSize, fSize, wSize);
		lhsShape = {nSize, cSize,
		// iw = ow * sw + kw * dw - 1
		// (i.e. 16 convolved with 3 (@stride 1 dilation 1) -> 14)
		// Perform the proper inclusive -> exclusive -> inclusive.
		((wSize - 1) * strideW + 1) + ((kwSize - 1) * dilationW + 1) -
		1};
		rhsShape = {fSize, cSize, kwSize};
		resShape = {nSize, fSize, wSize};
		break;
		default:
		return failure();
		}

vector::TransferWriteOp write;		vector::TransferWriteOp write;
Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);		Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);

// w is unrolled (i.e. wSizeStep == 1) iff strideW > 1.		// w is unrolled (i.e. wSizeStep == 1) iff strideW > 1.
// When strideW == 1, we can batch the contiguous loads and avoid		// When strideW == 1, we can batch the contiguous loads and avoid
// unrolling		// unrolling
int64_t wSizeStep = strideW == 1 ? wSize : 1;		int64_t wSizeStep = strideW == 1 ? wSize : 1;

Type lhsEltType = lhsShapedType.getElementType();		Type lhsEltType = lhsShapedType.getElementType();
Type rhsEltType = rhsShapedType.getElementType();		Type rhsEltType = rhsShapedType.getElementType();
Type resEltType = resShapedType.getElementType();		Type resEltType = resShapedType.getElementType();
VectorType lhsType = VectorType::get(		auto lhsType = VectorType::get(lhsShape, lhsEltType);
{nSize,		auto rhsType = VectorType::get(rhsShape, rhsEltType);
// iw = ow * sw + kw * dw - 1		auto resType = VectorType::get(resShape, resEltType);
// (i.e. 16 convolved with 3 (@stride 1 dilation 1) -> 14)
// Perform the proper inclusive -> exclusive -> inclusive.
((wSize - 1) * strideW + 1) + ((kwSize - 1) * dilationW + 1) - 1,
cSize},
lhsEltType);
VectorType rhsType = VectorType::get({kwSize, cSize, fSize}, rhsEltType);
VectorType resType = VectorType::get({nSize, wSize, fSize}, resEltType);

// Read lhs slice of size {n, w * strideW + kw * dilationW, c} @ [0, 0,		// Read lhs slice of size {n, w * strideW + kw * dilationW, c} @ [0, 0,
		hanchungUnsubmitted Not Done Reply Inline Actions nit: auto VectorType::get already spells the type. hanchung: nit: auto VectorType::get already spells the type.
// 0].		// 0].
Value lhs = builder.create<vector::TransferReadOp>(		Value lhs = builder.create<vector::TransferReadOp>(
loc, lhsType, lhsShaped, ValueRange{zero, zero, zero});		loc, lhsType, lhsShaped, ValueRange{zero, zero, zero});
// Read rhs slice of size {kw, c, f} @ [0, 0, 0].		// Read rhs slice of size {kw, c, f} @ [0, 0, 0].
Value rhs = builder.create<vector::TransferReadOp>(		Value rhs = builder.create<vector::TransferReadOp>(
loc, rhsType, rhsShaped, ValueRange{zero, zero, zero});		loc, rhsType, rhsShaped, ValueRange{zero, zero, zero});
// Read res slice of size {n, w, f} @ [0, 0, 0].		// Read res slice of size {n, w, f} @ [0, 0, 0].
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions Please use an enum and spell out the shapes instead of swapping. Swapping has a level of indirection that makes things tricky to follow. nicolasvasilache: Please use an enum and spell out the shapes instead of swapping. Swapping has a level of…
Value res = builder.create<vector::TransferReadOp>(		Value res = builder.create<vector::TransferReadOp>(
loc, resType, resShaped, ValueRange{zero, zero, zero});		loc, resType, resShaped, ValueRange{zero, zero, zero});

		// The base vectorization case is input: {n,w,c}, weight: {kw,c,f}, output:
		// {n,w,f}. To reuse the base pattern vectorization case, we do pre
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions You could say: // The base vectorization case is input: nwc, weight: ..., output: ... if (enum == Conv1DOpOrder::Nwc) ; // To match the base vectorization case, we pre/post transpose the case we have. if (enum == Conv1DOpOrder::Ncw) { // Convert input: ncw -> nwc. lhs = builder.create<vector::TransposeOp>(loc, lhs, ArrayRef<int64_t>{0, 2, 1}); ... } I find this makes the nicolasvasilache: You could say: ``` // The base vectorization case is input: nwc, weight: ..., output: ... if…
		raikonenfnuAuthorUnsubmitted Not Done Reply Inline Actions Thanks for the great comments and suggestions! I have addressed most of these, please let me know if I am missing anything :) raikonenfnu: Thanks for the great comments and suggestions! I have addressed most of these, please let me…
		// transpose on input, weight, and output.
		switch (conv1DOpOrder) {
		case Conv1DOpOrder::Nwc:
		// Base case, so no transposes necessary.
		break;
		case Conv1DOpOrder::Ncw:
		// To match base vectorization case, we pre-transpose current case.
		// ncw -> nwc
		static constexpr std::array<int64_t, 3> permLhs = {0, 2, 1};
		lhs = builder.create<vector::TransposeOp>(loc, lhs, permLhs);
		// fcw -> wcf
		static constexpr std::array<int64_t, 3> permRhs = {2, 1, 0};
		rhs = builder.create<vector::TransposeOp>(loc, rhs, permRhs);
		// nfw -> nwf
		static constexpr std::array<int64_t, 3> permRes = {0, 2, 1};
		res = builder.create<vector::TransposeOp>(loc, res, permRes);
		break;
		default:
		return failure();
		}

//===------------------------------------------------------------------===//		//===------------------------------------------------------------------===//
// Begin vector-only rewrite part		// Begin vector-only rewrite part
//===------------------------------------------------------------------===//		//===------------------------------------------------------------------===//
// Unroll along kw and read slices of lhs and rhs.		// Unroll along kw and read slices of lhs and rhs.
SmallVector<Value> lhsVals, rhsVals, resVals;		SmallVector<Value> lhsVals, rhsVals, resVals;
// Extract lhs slice of size {n, wSizeStep, c} @ [0, sw * w + dw * kw, 0].		// Extract lhs slice of size {n, wSizeStep, c} @ [0, sw * w + dw * kw, 0].
for (int64_t kw = 0; kw < kwSize; ++kw) {		for (int64_t kw = 0; kw < kwSize; ++kw) {
for (int64_t w = 0; w < wSize; w += wSizeStep) {		for (int64_t w = 0; w < wSize; w += wSizeStep) {
Show All 37 Lines	for (int64_t w = 0; w < wSize; w += wSizeStep) {
loc, resVals[w], res,		loc, resVals[w], res,
/*offsets=*/ArrayRef<int64_t>{0, w, 0},		/*offsets=*/ArrayRef<int64_t>{0, w, 0},
/*strides=*/ArrayRef<int64_t>{1, 1, 1});		/*strides=*/ArrayRef<int64_t>{1, 1, 1});
}		}
//===------------------------------------------------------------------===//		//===------------------------------------------------------------------===//
// End vector-only rewrite part		// End vector-only rewrite part
//===------------------------------------------------------------------===//		//===------------------------------------------------------------------===//

		// The base vectorization case is output: {n,w,f}
		// To reuse the result from base pattern vectorization case, we post
		// transpose the base case result.
		switch (conv1DOpOrder) {
		hanchungUnsubmitted Not Done Reply Inline Actions It's better to use switch-case. With `enum class`, we won't miss updating the snippet when adding a new order. hanchung: It's better to use switch-case. With `enum class`, we won't miss updating the snippet when…
		case Conv1DOpOrder::Nwc:
		// Base case, so no transposes necessary.
		break;
		case Conv1DOpOrder::Ncw:
		hanchungUnsubmitted Not Done Reply Inline Actions nit: permRes, maybe just rename it to `perm`. hanchung: nit: permRes, maybe just rename it to `perm`.
		// nwf -> nfw
		static constexpr std::array<int64_t, 3> perm = {0, 2, 1};
		res = builder.create<vector::TransposeOp>(loc, res, perm);
		break;
		default:
		return failure();
		}

// Write back res slice of size {n, w, f} @ [0, 0, 0].		// Write back res slice of size {n, w, f} @ [0, 0, 0].
return builder		return builder
.create<vector::TransferWriteOp>(loc, res, resShaped,		.create<vector::TransferWriteOp>(loc, res, resShaped,
ValueRange{zero, zero, zero})		ValueRange{zero, zero, zero})
.getOperation();		.getOperation();
}		}

// Create a contraction: lhs{n, w, c} * rhs{c, f} -> res{n, w, f}		// Create a contraction: lhs{n, w, c} * rhs{c, f} -> res{n, w, f}
▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines	struct Conv1DGenerator : public StructuredGenerator<LinalgOp> {
Value depthwiseConv1dSliceAsFma(OpBuilder &b, Location loc, Value lhs,		Value depthwiseConv1dSliceAsFma(OpBuilder &b, Location loc, Value lhs,
Value rhs, Value res) {		Value rhs, Value res) {
Value bcast = builder.create<vector::BroadcastOp>(loc, res.getType(), rhs);		Value bcast = builder.create<vector::BroadcastOp>(loc, res.getType(), rhs);
return b.create<vector::FMAOp>(loc, lhs, bcast, res);		return b.create<vector::FMAOp>(loc, lhs, bcast, res);
}		}

/// Entry point that transposes into the common form:		/// Entry point that transposes into the common form:
/// {{n, strideW * w + dilationW * kw, c}, {kw, c, f}, {n, w, f}}		/// {{n, strideW * w + dilationW * kw, c}, {kw, c, f}, {n, w, f}}
FailureOr<Operation *> generateConv() {		FailureOr<Operation *> generateNwcConv() {
AffineExpr n, w, f, kw, c;		AffineExpr n, w, f, kw, c;
bindDims(ctx, n, w, f, kw, c);		bindDims(ctx, n, w, f, kw, c);
if (!iters({Par(), Par(), Par(), Red(), Red()}))		if (!iters({Par(), Par(), Par(), Red(), Red()}))
return failure();		return failure();

// No transposition needed.		// No transposition needed.
if (layout({/*lhsIndex*/ {n, strideW * w + dilationW * kw, c},		if (layout({/*lhsIndex*/ {n, strideW * w + dilationW * kw, c},
/*rhsIndex*/ {kw, c, f},		/*rhsIndex*/ {kw, c, f},
/*resIndex*/ {n, w, f}}))		/*resIndex*/ {n, w, f}}))
return conv();		return conv(Conv1DOpOrder::Nwc);
		return failure();
		}

		/// Entry point that transposes into the common form:
		/// {{n, c, strideW * w + dilationW * kw}, {f, c, kw}, {n, f, w}}
		FailureOr<Operation *> generateNcwConv() {
		AffineExpr n, w, f, kw, c;
		bindDims(ctx, n, f, w, c, kw);
		if (!iters({Par(), Par(), Par(), Red(), Red()}))
		return failure();

		if (layout({/*lhsIndex*/ {n, c, strideW * w + dilationW * kw},
		/*rhsIndex*/ {f, c, kw},
		/*resIndex*/ {n, f, w}}))
		return conv(Conv1DOpOrder::Ncw);

return failure();		return failure();
}		}

/// Entry point that transposes into the common form:		/// Entry point that transposes into the common form:
/// {{n, strideW * w + dilationW * kw, c}, {kw, c}, {n, w, c}}		/// {{n, strideW * w + dilationW * kw, c}, {kw, c}, {n, w, c}}
FailureOr<Operation *> generateDilatedConv() {		FailureOr<Operation *> generateDilatedConv() {
AffineExpr n, w, c, kw;		AffineExpr n, w, c, kw;
bindDims(ctx, n, w, c, kw);		bindDims(ctx, n, w, c, kw);
Show All 22 Lines	static FailureOr<Operation *> vectorizeConvolution(OpBuilder &b, LinalgOp op) {
// The ConvolutionOpInterface gives us guarantees of existence for		// The ConvolutionOpInterface gives us guarantees of existence for
// strides/dilations. However, we do not need to rely on those, we can simply		// strides/dilations. However, we do not need to rely on those, we can simply
// use them if present, otherwise use the default and let the generic conv.		// use them if present, otherwise use the default and let the generic conv.
// matcher in the ConvGenerator succeed or fail.		// matcher in the ConvGenerator succeed or fail.
auto strides = op->getAttrOfType<DenseIntElementsAttr>("strides");		auto strides = op->getAttrOfType<DenseIntElementsAttr>("strides");
auto dilations = op->getAttrOfType<DenseIntElementsAttr>("dilations");		auto dilations = op->getAttrOfType<DenseIntElementsAttr>("dilations");
auto stride = strides ? *strides.getValues<uint64_t>().begin() : 1;		auto stride = strides ? *strides.getValues<uint64_t>().begin() : 1;
auto dilation = dilations ? *dilations.getValues<uint64_t>().begin() : 1;		auto dilation = dilations ? *dilations.getValues<uint64_t>().begin() : 1;
Conv1DNwcGenerator e(b, op, stride, dilation);		Conv1DGenerator e(b, op, stride, dilation);
auto res = e.generateConv();		auto res = e.generateNwcConv();
		if (succeeded(res))
		return res;
		res = e.generateNcwConv();
if (succeeded(res))		if (succeeded(res))
return res;		return res;
return e.generateDilatedConv();		return e.generateDilatedConv();
}		}

struct VectorizeConvolution : public OpInterfaceRewritePattern<LinalgOp> {		struct VectorizeConvolution : public OpInterfaceRewritePattern<LinalgOp> {
using OpInterfaceRewritePattern::OpInterfaceRewritePattern;		using OpInterfaceRewritePattern::OpInterfaceRewritePattern;

Show All 20 Lines

mlir/test/Dialect/Linalg/vectorize-convolution.mlir

	Show First 20 Lines • Show All 181 Lines • ▼ Show 20 Lines
	// CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER_1]], %[[CONTRACT_0]]			// CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER_1]], %[[CONTRACT_0]]
	// CHECK-SAME: : vector<4x2x3xf32>, vector<3x8xf32> into vector<4x2x8xf32>			// CHECK-SAME: : vector<4x2x3xf32>, vector<3x8xf32> into vector<4x2x8xf32>

	// Write the result back in one shot.			// Write the result back in one shot.
	// CHECK: vector.transfer_write %[[CONTRACT_1]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]			// CHECK: vector.transfer_write %[[CONTRACT_1]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]

	// -----			// -----

				func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x3x1xf32>, %output: memref<4x8x2xf32>) {
				linalg.conv_1d_ncw_fcw
				{dilations = dense<1> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>}
				ins(%input, %filter : memref<4x3x6xf32>, memref<8x3x1xf32>)
				outs(%output : memref<4x8x2xf32>)
				return
				}

				// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
				// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
				// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>

				// CHECK: func @conv1d_ncw_4x8x2_memref
				// CHECK-SAME: (%[[INPUT:.+]]: memref<4x3x6xf32>, %[[FILTER:.+]]: memref<8x3x1xf32>, %[[OUTPUT:.+]]: memref<4x8x2xf32>)

				// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
				// CHECK-DAG: %[[F0:.+]] = arith.constant 0.000000e+00 : f32

				/// Read the whole data in one shot.
				// CHECK-DAG: %[[V_NWC_INPUT_R:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_NWC_FILTER_R:.+]] = vector.transfer_read %[[FILTER]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_NWC_OUTPUT_R:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]

				/// Transpose result to nwc format.
				// CHECK-DAG: %[[V_INPUT_R:.+]] = vector.transpose %[[V_NWC_INPUT_R]], [0, 2, 1]
				// CHECK-DAG: %[[V_FILTER_R:.+]] = vector.transpose %[[V_NWC_FILTER_R]], [2, 1, 0]
				// CHECK-DAG: %[[V_OUTPUT_R:.+]] = vector.transpose %[[V_NWC_OUTPUT_R]], [0, 2, 1]

				// CHECK: %[[V_INPUT_0:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
				// CHECK-SAME: {offsets = [0, 0, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x4x3xf32> to vector<4x1x3xf32>
				// CHECK: %[[V_INPUT_1:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
				// CHECK-SAME: {offsets = [0, 3, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x4x3xf32> to vector<4x1x3xf32>

				// CHECK: %[[V_FILTER:.+]] = vector.extract %[[V_FILTER_R]][0] : vector<1x3x8xf32>

				// CHECK: %[[V_OUTPUT_0:.+]] = vector.extract_strided_slice %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 0, 0], sizes = [4, 1, 8], strides = [1, 1, 1]} : vector<4x2x8xf32> to vector<4x1x8xf32>
				// CHECK: %[[V_OUTPUT_1:.+]] = vector.extract_strided_slice %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 1, 0], sizes = [4, 1, 8], strides = [1, 1, 1]} : vector<4x2x8xf32> to vector<4x1x8xf32>

				/// w == 0, kw == 0
				// CHECK: %[[CONTRACT_0:.+]] = vector.contract {
				// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
				// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
				// CHECK-SAME: %[[V_INPUT_0]], %[[V_FILTER]], %[[V_OUTPUT_0]]
				// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>

				/// w == 1, kw == 0
				// CHECK: %[[CONTRACT_1:.+]] = vector.contract {
				// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
				// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
				// CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER]], %[[V_OUTPUT_1]]
				// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>

				/// w == 0, kw == 0
				// CHECK: %[[RES_0:.+]] = vector.insert_strided_slice %[[CONTRACT_0]], %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<4x1x8xf32> into vector<4x2x8xf32>
				/// w == 1, kw == 0
				// CHECK: %[[RES_1:.+]] = vector.insert_strided_slice %[[CONTRACT_1]], %[[RES_0]]
				// CHECK-SAME: {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<4x1x8xf32> into vector<4x2x8xf32>

				/// Transpose result to ncw format.
				// CHECK: %[[RES_2:.+]] = vector.transpose %[[RES_1]], [0, 2, 1]

				// Write the result back in one shot.
				// CHECK: vector.transfer_write %[[RES_2]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]

				// -----

				hanchungUnsubmitted Not Done Reply Inline Actions remove the empty test? hanchung: remove the empty test?
				raikonenfnuAuthorUnsubmitted Done Reply Inline Actions good catch, done! raikonenfnu: good catch, done!
				func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x3x2xf32>, %output: memref<4x8x2xf32>) {
				linalg.conv_1d_ncw_fcw
				{dilations = dense<2> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>}
				ins(%input, %filter : memref<4x3x6xf32>, memref<8x3x2xf32>)
				outs(%output : memref<4x8x2xf32>)
				return
				}

				// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
				// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
				// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>

				// CHECK: func @conv1d_ncw_4x8x2_memref
				// CHECK-SAME: (%[[INPUT:.+]]: memref<4x3x6xf32>, %[[FILTER:.+]]: memref<8x3x2xf32>, %[[OUTPUT:.+]]: memref<4x8x2xf32>)

				// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
				// CHECK-DAG: %[[F0:.+]] = arith.constant 0.000000e+00 : f32

				/// Read the whole data in one shot.
				// CHECK-DAG: %[[V_NWC_INPUT_R:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_NWC_FILTER_R:.+]] = vector.transfer_read %[[FILTER]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_NWC_OUTPUT_R:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]

				/// Transpose result to nwc format.
				// CHECK-DAG: %[[V_INPUT_R:.+]] = vector.transpose %[[V_NWC_INPUT_R]], [0, 2, 1]
				// CHECK-DAG: %[[V_FILTER_R:.+]] = vector.transpose %[[V_NWC_FILTER_R]], [2, 1, 0]
				// CHECK-DAG: %[[V_OUTPUT_R:.+]] = vector.transpose %[[V_NWC_OUTPUT_R]], [0, 2, 1]

				// CHECK: %[[V_INPUT_0:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
				// CHECK-SAME: {offsets = [0, 0, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x6x3xf32> to vector<4x1x3xf32>
				// CHECK: %[[V_INPUT_1:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
				// CHECK-SAME: {offsets = [0, 3, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x6x3xf32> to vector<4x1x3xf32>
				// CHECK: %[[V_INPUT_2:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
				// CHECK-SAME: {offsets = [0, 2, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x6x3xf32> to vector<4x1x3xf32>
				// CHECK: %[[V_INPUT_3:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
				// CHECK-SAME: {offsets = [0, 5, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x6x3xf32> to vector<4x1x3xf32>

				// CHECK: %[[V_FILTER_0:.+]] = vector.extract %[[V_FILTER_R]][0] : vector<2x3x8xf32>
				// CHECK: %[[V_FILTER_1:.+]] = vector.extract %[[V_FILTER_R]][1] : vector<2x3x8xf32>

				// CHECK: %[[V_OUTPUT_0:.+]] = vector.extract_strided_slice %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 0, 0], sizes = [4, 1, 8], strides = [1, 1, 1]} : vector<4x2x8xf32> to vector<4x1x8xf32>
				// CHECK: %[[V_OUTPUT_1:.+]] = vector.extract_strided_slice %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 1, 0], sizes = [4, 1, 8], strides = [1, 1, 1]} : vector<4x2x8xf32> to vector<4x1x8xf32>

				/// w == 0, kw == 0
				// CHECK: %[[CONTRACT_0:.+]] = vector.contract {
				// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
				// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
				// CHECK-SAME: %[[V_INPUT_0]], %[[V_FILTER_0]], %[[V_OUTPUT_0]]
				// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>
				/// w == 1, kw == 0
				// CHECK: %[[CONTRACT_1:.+]] = vector.contract {
				// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
				// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
				// CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER_0]], %[[V_OUTPUT_1]]
				// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>
				/// w == 0, kw == 1
				// CHECK: %[[CONTRACT_2:.+]] = vector.contract {
				// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
				// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
				// CHECK-SAME: %[[V_INPUT_2]], %[[V_FILTER_1]], %[[CONTRACT_0]]
				// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>
				/// w == 1, kw == 1
				// CHECK: %[[CONTRACT_3:.+]] = vector.contract {
				// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
				// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
				// CHECK-SAME: %[[V_INPUT_3]], %[[V_FILTER_1]], %[[CONTRACT_1]]
				// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>

				/// w == 0, kw == 1
				// CHECK: %[[RES_0:.+]] = vector.insert_strided_slice %[[CONTRACT_2]], %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<4x1x8xf32> into vector<4x2x8xf32>
				/// w == 1, kw == 1
				// CHECK: %[[RES_1:.+]] = vector.insert_strided_slice %[[CONTRACT_3]], %[[RES_0]]
				// CHECK-SAME: {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<4x1x8xf32> into vector<4x2x8xf32>

				/// Transpose result to ncw format.
				// CHECK: %[[RES_2:.+]] = vector.transpose %[[RES_1]], [0, 2, 1]

				// Write the result back in one shot.
				// CHECK: vector.transfer_write %[[RES_2]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]

				// -----

				func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x3x2xf32>, %output: memref<4x8x2xf32>) {
				linalg.conv_1d_ncw_fcw
				{dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
				ins(%input, %filter : memref<4x3x6xf32>, memref<8x3x2xf32>)
				outs(%output : memref<4x8x2xf32>)
				return
				}

				// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
				// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
				// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>

				// CHECK: func @conv1d_ncw_4x8x2_memref
				// CHECK-SAME: (%[[INPUT:.+]]: memref<4x3x6xf32>, %[[FILTER:.+]]: memref<8x3x2xf32>, %[[OUTPUT:.+]]: memref<4x8x2xf32>)

				// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
				// CHECK-DAG: %[[F0:.+]] = arith.constant 0.000000e+00 : f32

				/// Read the whole data in one shot.
				// CHECK-DAG: %[[V_NWC_INPUT_R:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_NWC_FILTER_R:.+]] = vector.transfer_read %[[FILTER]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_NWC_OUTPUT_R:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]

				/// Transpose result to nwc format.
				// CHECK-DAG: %[[V_INPUT_R:.+]] = vector.transpose %[[V_NWC_INPUT_R]], [0, 2, 1]
				// CHECK-DAG: %[[V_FILTER_R:.+]] = vector.transpose %[[V_NWC_FILTER_R]], [2, 1, 0]
				// CHECK-DAG: %[[V_OUTPUT_R:.+]] = vector.transpose %[[V_NWC_OUTPUT_R]], [0, 2, 1]

				// CHECK: %[[V_INPUT_0:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
				// CHECK-SAME: {offsets = [0, 0, 0], sizes = [4, 2, 3], strides = [1, 1, 1]} : vector<4x4x3xf32> to vector<4x2x3xf32>
				// CHECK: %[[V_INPUT_1:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
				// CHECK-SAME: {offsets = [0, 2, 0], sizes = [4, 2, 3], strides = [1, 1, 1]} : vector<4x4x3xf32> to vector<4x2x3xf32>

				// CHECK: %[[V_FILTER_0:.+]] = vector.extract %[[V_FILTER_R]][0] : vector<2x3x8xf32>
				// CHECK: %[[V_FILTER_1:.+]] = vector.extract %[[V_FILTER_R]][1] : vector<2x3x8xf32>

				/// w == 0, kw == 0
				// CHECK: %[[CONTRACT_0:.+]] = vector.contract {
				// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
				// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
				// CHECK-SAME: %[[V_INPUT_0]], %[[V_FILTER_0]], %[[V_OUTPUT_R]]
				// CHECK-SAME: : vector<4x2x3xf32>, vector<3x8xf32> into vector<4x2x8xf32>
				/// w == 0, kw == 1
				// CHECK: %[[CONTRACT_1:.+]] = vector.contract {
				// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
				// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
				// CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER_1]], %[[CONTRACT_0]]
				// CHECK-SAME: : vector<4x2x3xf32>, vector<3x8xf32> into vector<4x2x8xf32>

				/// Transpose result to ncw format.
				// CHECK: %[[RES:.+]] = vector.transpose %[[CONTRACT_1]], [0, 2, 1]

				// Write the result back in one shot.
				// CHECK: vector.transfer_write %[[RES]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]


				// -----

	func.func @depthwise_conv1d_nwc_wc_3x5x4_memref(%input: memref<3x5x4xf32>, %filter: memref<2x4xf32>, %output: memref<3x2x4xf32>) {			func.func @depthwise_conv1d_nwc_wc_3x5x4_memref(%input: memref<3x5x4xf32>, %filter: memref<2x4xf32>, %output: memref<3x2x4xf32>) {
	linalg.depthwise_conv_1d_nwc_wc			linalg.depthwise_conv_1d_nwc_wc
	{dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}			{dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
	ins(%input, %filter : memref<3x5x4xf32>, memref<2x4xf32>)			ins(%input, %filter : memref<3x5x4xf32>, memref<2x4xf32>)
	outs(%output : memref<3x2x4xf32>)			outs(%output : memref<3x2x4xf32>)
	return			return
	}			}

	▲ Show 20 Lines • Show All 55 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][linalg] Vectorization for conv_1d_ncw_fcw
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 460141

mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp

mlir/test/Dialect/Linalg/vectorize-convolution.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][linalg] Vectorization for conv_1d_ncw_fcw
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 460141

mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp

mlir/test/Dialect/Linalg/vectorize-convolution.mlir

[mlir][linalg] Vectorization for conv_1d_ncw_fcw
ClosedPublic