This is an archive of the discontinued LLVM Phabricator instance.

[mlir][Linalg] Refactor vectorization of conv1d more aggressively.
ClosedPublic

Authored by nicolasvasilache on Oct 29 2021, 8:02 AM.

Download Raw Diff

Details

Reviewers

ThomasRaoux
mravishankar
antiagainst
ftynse
pifon2a
aartbik

Commits

rG9c4971740b87: [mlir][Linalg] Refactor vectorization of conv1d more aggressively.

Summary

This is more of a proof of concept for now, although it is correct and does not result in noticeable performance degradations.

This is what a better decoupling of transfer read/write from the vectorization of conv would look like. This form is close to ready to plop into a new vector.conv op, with the vector.transfer operations to be generalized as part of generic vectorization once the properties of ConvolutionOpInterface are inferred from the indexing maps.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

nicolasvasilache created this revision.Oct 29 2021, 8:02 AM

Herald added a reviewer: aartbik. · View Herald TranscriptOct 29 2021, 8:02 AM

Herald added subscribers: wenzhicui, wrengr, Chia-hungDuan and 19 others. · View Herald Transcript

nicolasvasilache requested review of this revision.Oct 29 2021, 8:02 AM

Herald added a project: Restricted Project. · View Herald TranscriptOct 29 2021, 8:02 AM

Herald added subscribers: limo1996, stephenneuendorffer. · View Herald Transcript

Harbormaster completed remote builds in B131429: Diff 383357.Oct 29 2021, 8:03 AM

ThomasRaoux accepted this revision.Nov 1 2021, 3:34 PM

This revision is now accepted and ready to land.Nov 1 2021, 3:34 PM

Nice! In my impl in IREE I was sort of doing this load-whole-and-extract-slice pattern, but only for filters. I'd assume it should be helpful because we load the full filter ahead and put it in registers to increase reuse? But CPU might differ here.

Rebase.

Harbormaster completed remote builds in B132158: Diff 384356.Nov 3 2021, 1:27 AM

Closed by commit rG9c4971740b87: [mlir][Linalg] Refactor vectorization of conv1d more aggressively. (authored by nicolasvasilache). · Explain WhyNov 3 2021, 1:29 AM

This revision was automatically updated to reflect the committed changes.

nicolasvasilache added a commit: rG9c4971740b87: [mlir][Linalg] Refactor vectorization of conv1d more aggressively..

Revision Contents

Path

Size

mlir/

lib/

Dialect/

Linalg/

Transforms/

Vectorization.cpp

114 lines

test/

Dialect/

Linalg/

vectorize-convolution.mlir

160 lines

Diff 384359

mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp

Show First 20 Lines • Show All 1,452 Lines • ▼ Show 20 Lines	FailureOr<Operation *> conv() {

vector::TransferWriteOp write;		vector::TransferWriteOp write;
Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);		Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);

// w is unrolled (i.e. wSizeStep == 1) iff strideW > 1.		// w is unrolled (i.e. wSizeStep == 1) iff strideW > 1.
// When strideW == 1, we can batch the contiguous loads and avoid unrolling		// When strideW == 1, we can batch the contiguous loads and avoid unrolling
int64_t wSizeStep = strideW == 1 ? wSize : 1;		int64_t wSizeStep = strideW == 1 ? wSize : 1;

VectorType lhsType = VectorType::get({nSize, wSizeStep, cSize},		Type lhsEltType = lhsShapedType.getElementType();
lhsShapedType.getElementType());		Type rhsEltType = rhsShapedType.getElementType();
VectorType rhsType =		Type resEltType = resShapedType.getElementType();
VectorType::get({cSize, fSize}, rhsShapedType.getElementType());		VectorType lhsType = VectorType::get(
VectorType resType = VectorType::get({nSize, wSizeStep, fSize},		{nSize, (wSize - 1) * strideW + 1 + (kwSize - 1) * dilationW + 1,
resShapedType.getElementType());		cSize},
		lhsEltType);
SmallVector<Value> lhsVals, rhsVals, resVals;		VectorType rhsType = VectorType::get({kwSize, cSize, fSize}, rhsEltType);
		VectorType resType = VectorType::get({nSize, wSize, fSize}, resEltType);

		// Read lhs slice of size {w * strideW + kw * dilationW, c, f} @ [0, 0, 0].
		Value lhs = builder.create<vector::TransferReadOp>(
		loc, lhsType, lhsShaped, ValueRange{zero, zero, zero});
		// Read rhs slice of size {kw, c, f} @ [0, 0, 0].
		Value rhs = builder.create<vector::TransferReadOp>(
		loc, rhsType, rhsShaped, ValueRange{zero, zero, zero});
		// Read res slice of size {n, w, f} @ [0, 0, 0].
		Value res = builder.create<vector::TransferReadOp>(
		loc, resType, resShaped, ValueRange{zero, zero, zero});

		//===------------------------------------------------------------------===//
		// Begin vector-only rewrite part
		//===------------------------------------------------------------------===//
// Unroll along kw and read slices of lhs and rhs.		// Unroll along kw and read slices of lhs and rhs.
// Alternatively we could preload both 3-d slices and extract smaller slices		SmallVector<Value> lhsVals, rhsVals, resVals;
// iteratively without touching memory. But this will quickly spill.
for (int64_t kw = 0; kw < kwSize; ++kw) {		for (int64_t kw = 0; kw < kwSize; ++kw) {
// Read rhs slice of size {c, f} @ [kw, 0, 0].		// Extract rhs slice of size {c, f} @ [kw].
Value kwVal = builder.create<arith::ConstantIndexOp>(loc, kw);		rhsVals.push_back(builder.create<vector::ExtractOp>(
rhsVals.push_back(builder.create<vector::TransferReadOp>(		loc, rhs, /*offsets=*/ArrayRef<int64_t>{kw}));
loc, rhsType, rhsShaped, ValueRange{kwVal, zero, zero}));

for (int64_t w_iv = 0; w_iv < wSize; w_iv += wSizeStep) {		for (int64_t w = 0; w < wSize; w += wSizeStep) {
// Read lhs slice of size {n, wSizeStep, c}		// Extract lhs slice of size {n, wSizeStep, c}
// @ [0, sw * w + dw * kw, 0].		// @ [0, sw * w + dw * kw, 0].
Value lhsStridedIdx = builder.create<arith::ConstantIndexOp>(		lhsVals.push_back(builder.create<vector::ExtractStridedSliceOp>(
loc, strideW * w_iv + dilationW * kw);		loc, lhs,
lhsVals.push_back(builder.create<vector::TransferReadOp>(		/*offsets=*/ArrayRef<int64_t>{0, w * strideW + kw * dilationW, 0},
loc, lhsType, lhsShaped, ValueRange{zero, lhsStridedIdx, zero}));		/*sizes=*/ArrayRef<int64_t>{nSize, wSizeStep, cSize},
		/*strides=*/ArrayRef<int64_t>{1, 1, 1}));
// Read res slice: {n, wSizeStep, f} @ [0, w, 0].
Value wVal = builder.create<arith::ConstantIndexOp>(loc, w_iv);		// This does not depend on kw.
// When operating on tensors, reading from the updated value is required		if (kw == 0) {
// for vector.transfer_read/write hoisting to function as expected.		// Extract res slice: {n, wSizeStep, f} @ [0, w, 0].
resVals.push_back(builder.create<vector::TransferReadOp>(		resVals.push_back(builder.create<vector::ExtractStridedSliceOp>(
loc, resType, resShaped, ValueRange{zero, wVal, zero}));		loc, res,
		/*offsets=*/ArrayRef<int64_t>{0, w, 0},
		/*sizes=*/ArrayRef<int64_t>{nSize, wSizeStep, fSize},
		/*strides=*/ArrayRef<int64_t>{1, 1, 1}));
}		}
}		}
for (int64_t kw = 0; kw < kwSize; ++kw) {
for (int64_t w_iv = 0; w_iv < wSize; w_iv += wSizeStep) {
// Compute contraction: I{n, w, c} * F{c, f} -> O{n, w, f}
resVals[kw * (wSize / wSizeStep) + w_iv] = conv1dSliceAsContraction(
builder, loc, lhsVals[kw * (wSize / wSizeStep) + w_iv], rhsVals[kw],
resVals[kw * (wSize / wSizeStep) + w_iv]);
}
}		}

		auto linearIndex = [&](int64_t kw, int64_t w) {
		return kw * (wSize / wSizeStep) + w;
		};

		// Compute contraction: O{n, w, f} += I{n, sw * w + dw * kw, c} * F{c, f}
for (int64_t kw = 0; kw < kwSize; ++kw) {		for (int64_t kw = 0; kw < kwSize; ++kw) {
for (int64_t w_iv = 0; w_iv < wSize; w_iv += wSizeStep) {		for (int64_t w = 0; w < wSize; w += wSizeStep) {
Value wVal = builder.create<arith::ConstantIndexOp>(loc, w_iv);		resVals[w] = conv1dSliceAsContraction(
// Write back res slice: {n, wSizeStep, f} @ [0, w, 0].		builder, loc, lhsVals[linearIndex(kw, w)], rhsVals[kw], resVals[w]);
write = builder.create<vector::TransferWriteOp>(
loc, resVals[kw * (wSize / wSizeStep) + w_iv], resShaped,
ValueRange{zero, wVal, zero});
if (write.getNumResults() == 1)
resShaped = write->getResult(0);
}		}
}		}

return write.getOperation();		// Write back res slice: {n, wSizeStep, f} @ [0, w, 0].
		// This does not depend on kw.
		for (int64_t w = 0; w < wSize; w += wSizeStep) {
		res = builder.create<vector::InsertStridedSliceOp>(
		loc, resVals[w], res,
		/*offsets=*/ArrayRef<int64_t>{0, w, 0},
		/*strides=*/ArrayRef<int64_t>{1, 1, 1});
		}
		//===------------------------------------------------------------------===//
		// End vector-only rewrite part
		//===------------------------------------------------------------------===//

		// Write back res slice of size {n, w, f} @ [0, 0, 0].
		return builder
		.create<vector::TransferWriteOp>(loc, res, resShaped,
		ValueRange{zero, zero, zero})
		.getOperation();
}		}

// Create a contraction: lhs{n, w, c} * rhs{c, f} -> res{n, w, f}		// Create a contraction: lhs{n, w, c} * rhs{c, f} -> res{n, w, f}
vector::ContractionOp conv1dSliceAsContraction(OpBuilder &b, Location loc,		vector::ContractionOp conv1dSliceAsContraction(OpBuilder &b, Location loc,
Value lhs, Value rhs,		Value lhs, Value rhs,
Value res) {		Value res) {
StringRef par = Par().strRef, red = Red().strRef;		StringRef par = Par().strRef, red = Red().strRef;
AffineExpr n, w, f, c;		AffineExpr n, w, f, c;
▲ Show 20 Lines • Show All 71 Lines • Show Last 20 Lines

mlir/test/Dialect/Linalg/vectorize-convolution.mlir

	Show All 10 Lines
	// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>			// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
	// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>			// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
	// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>			// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>

	// CHECK: func @conv1d_nwc_4x2x8_memref			// CHECK: func @conv1d_nwc_4x2x8_memref
	// CHECK-SAME: (%[[INPUT:.+]]: memref<4x6x3xf32>, %[[FILTER:.+]]: memref<1x3x8xf32>, %[[OUTPUT:.+]]: memref<4x2x8xf32>)			// CHECK-SAME: (%[[INPUT:.+]]: memref<4x6x3xf32>, %[[FILTER:.+]]: memref<1x3x8xf32>, %[[OUTPUT:.+]]: memref<4x2x8xf32>)

	// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index			// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
	// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
	// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
	// CHECK-DAG: %[[F0:.+]] = arith.constant 0.000000e+00 : f32			// CHECK-DAG: %[[F0:.+]] = arith.constant 0.000000e+00 : f32

				/// Read the whole data in one shot.
				// CHECK-DAG: %[[V_INPUT_R:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_FILTER_R:.+]] = vector.transfer_read %[[FILTER]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_OUTPUT_R:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]

				// CHECK: %[[V_FILTER:.+]] = vector.extract %[[V_FILTER_R]][0] : vector<1x3x8xf32>
	/// w == 0, kw == 0			/// w == 0, kw == 0
	// CHECK: %[[V_FILTER:.+]] = vector.transfer_read %[[FILTER]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]			// CHECK: %[[V_INPUT_0:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
	// CHECK: %[[V_INPUT0:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]			// CHECK-SAME: {offsets = [0, 0, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x5x3xf32> to vector<4x1x3xf32>
	// CHECK: %[[V_OUTPUT_0:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]			// CHECK: %[[V_OUTPUT_0:.+]] = vector.extract_strided_slice %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 0, 0], sizes = [4, 1, 8], strides = [1, 1, 1]} : vector<4x2x8xf32> to vector<4x1x8xf32>
	/// w == 1, kw == 0			/// w == 1, kw == 0
	// CHECK: %[[V_INPUT3:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C3]], %[[C0]]], %[[F0]]			// CHECK: %[[V_INPUT_1:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
	// CHECK: %[[V_OUTPUT_1:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C1]], %[[C0]]], %[[F0]]			// CHECK-SAME: {offsets = [0, 3, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x5x3xf32> to vector<4x1x3xf32>
	// CHECK: %[[CONTRACT0:.+]] = vector.contract {			// CHECK: %[[V_OUTPUT_1:.+]] = vector.extract_strided_slice %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 1, 0], sizes = [4, 1, 8], strides = [1, 1, 1]} : vector<4x2x8xf32> to vector<4x1x8xf32>

	/// w == 0, kw == 0			/// w == 0, kw == 0
				// CHECK: %[[CONTRACT_0:.+]] = vector.contract {
	// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],			// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
	// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}			// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
	// CHECK-SAME: %[[V_INPUT0]], %[[V_FILTER]], %[[V_OUTPUT_0]]			// CHECK-SAME: %[[V_INPUT_0]], %[[V_FILTER]], %[[V_OUTPUT_0]]
	// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>			// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>

	/// w == 1, kw == 0			/// w == 1, kw == 0
	// CHECK: %[[CONTRACT1:.+]] = vector.contract {			// CHECK: %[[CONTRACT_1:.+]] = vector.contract {
	// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],			// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
	// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}			// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
	// CHECK-SAME: %[[V_INPUT3]], %[[V_FILTER]], %[[V_OUTPUT_1]]			// CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER]], %[[V_OUTPUT_1]]
	// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>			// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>

	/// w == 0, kw == 0			/// w == 0, kw == 0
	// CHECK: vector.transfer_write %[[CONTRACT0]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]			// CHECK: %[[RES_0:.+]] = vector.insert_strided_slice %[[CONTRACT_0]], %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<4x1x8xf32> into vector<4x2x8xf32>
	/// w == 1, kw == 0			/// w == 1, kw == 0
	// CHECK: vector.transfer_write %[[CONTRACT1]], %[[OUTPUT]][%[[C0]], %[[C1]], %[[C0]]]			// CHECK: %[[RES_1:.+]] = vector.insert_strided_slice %[[CONTRACT_1]], %[[RES_0]]
				// CHECK-SAME: {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<4x1x8xf32> into vector<4x2x8xf32>

				// Write the result back in one shot.
				// CHECK: vector.transfer_write %[[RES_1]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]

	// -----			// -----

	func @conv1d_nwc_4x2x8_memref(%input: memref<4x6x3xf32>, %filter: memref<2x3x8xf32>, %output: memref<4x2x8xf32>) {			func @conv1d_nwc_4x2x8_memref(%input: memref<4x6x3xf32>, %filter: memref<2x3x8xf32>, %output: memref<4x2x8xf32>) {
	linalg.conv_1d_nwc_wcf			linalg.conv_1d_nwc_wcf
	{dilations = dense<2> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>}			{dilations = dense<2> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>}
	ins(%input, %filter : memref<4x6x3xf32>, memref<2x3x8xf32>)			ins(%input, %filter : memref<4x6x3xf32>, memref<2x3x8xf32>)
	outs(%output : memref<4x2x8xf32>)			outs(%output : memref<4x2x8xf32>)
	return			return
	}			}

	// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>			// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
	// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>			// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
	// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>			// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>

	// CHECK: func @conv1d_nwc_4x2x8_memref			// CHECK: func @conv1d_nwc_4x2x8_memref
	// CHECK-SAME: (%[[INPUT:.+]]: memref<4x6x3xf32>, %[[FILTER:.+]]: memref<2x3x8xf32>, %[[OUTPUT:.+]]: memref<4x2x8xf32>)			// CHECK-SAME: (%[[INPUT:.+]]: memref<4x6x3xf32>, %[[FILTER:.+]]: memref<2x3x8xf32>, %[[OUTPUT:.+]]: memref<4x2x8xf32>)

	// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index			// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
	// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
	// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
	// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
	// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
	// CHECK-DAG: %[[F0:.+]] = arith.constant 0.000000e+00 : f32			// CHECK-DAG: %[[F0:.+]] = arith.constant 0.000000e+00 : f32

				/// Read the whole data in one shot.
				// CHECK-DAG: %[[V_INPUT_R:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_FILTER_R:.+]] = vector.transfer_read %[[FILTER]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_OUTPUT_R:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]


	/// w == 0, kw == 0			/// w == 0, kw == 0
	// CHECK: %[[V_FILTER_A:.+]] = vector.transfer_read %[[FILTER]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]			// CHECK: %[[V_FILTER_0:.+]] = vector.extract %[[V_FILTER_R]][0] : vector<2x3x8xf32>
	// CHECK: %[[V_INPUT0_A:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]			// CHECK: %[[V_INPUT_0:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
	// CHECK: %[[V_OUTPUT_0_A:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]			// CHECK-SAME: {offsets = [0, 0, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x7x3xf32> to vector<4x1x3xf32>
				// CHECK: %[[V_OUTPUT_0:.+]] = vector.extract_strided_slice %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 0, 0], sizes = [4, 1, 8], strides = [1, 1, 1]} : vector<4x2x8xf32> to vector<4x1x8xf32>
				/// w == 1, kw == 0
				// CHECK: %[[V_INPUT_1:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
				// CHECK-SAME: {offsets = [0, 3, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x7x3xf32> to vector<4x1x3xf32>
				// CHECK: %[[V_OUTPUT_1:.+]] = vector.extract_strided_slice %[[V_OUTPUT_R]]
				// CHECK-SAME: {offsets = [0, 1, 0], sizes = [4, 1, 8], strides = [1, 1, 1]} : vector<4x2x8xf32> to vector<4x1x8xf32>

	/// w == 0, kw == 1			/// w == 0, kw == 1
	// CHECK: %[[V_INPUT3_A:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C3]], %[[C0]]], %[[F0]]			// CHECK: %[[V_FILTER_1:.+]] = vector.extract %[[V_FILTER_R]][1] : vector<2x3x8xf32>
	// CHECK: %[[V_OUTPUT_1_A:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C1]], %[[C0]]], %[[F0]]			// CHECK: %[[V_INPUT_2:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
				// CHECK-SAME: {offsets = [0, 2, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x7x3xf32> to vector<4x1x3xf32>
	/// w == 1, kw == 0			/// w == 1, kw == 0
	// CHECK: %[[V_FILTER_B:.+]] = vector.transfer_read %[[FILTER]][%[[C1]], %[[C0]], %[[C0]]], %[[F0]]			// CHECK: %[[V_INPUT_3:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
	// CHECK: %[[V_INPUT0_B:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C2]], %[[C0]]], %[[F0]]			// CHECK-SAME: {offsets = [0, 5, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x7x3xf32> to vector<4x1x3xf32>
	// CHECK: %[[V_OUTPUT_0_B:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
	/// w == 1, kw == 1
	// CHECK: %[[V_INPUT3_B:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C5]], %[[C0]]], %[[F0]]
	// CHECK: %[[V_OUTPUT_1_B:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C1]], %[[C0]]], %[[F0]]

	/// w == 0, kw == 0			/// w == 0, kw == 0
	// CHECK: %[[CONTRACT0_A:.+]] = vector.contract {			// CHECK: %[[CONTRACT_0:.+]] = vector.contract {
	// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],			// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
	// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}			// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
	// CHECK-SAME: %[[V_INPUT0_A]], %[[V_FILTER_A]], %[[V_OUTPUT_0_A]]			// CHECK-SAME: %[[V_INPUT_0]], %[[V_FILTER_0]], %[[V_OUTPUT_0]]
	// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>			// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>
	/// w == 0, kw == 1			/// w == 1, kw == 0
	// CHECK: %[[CONTRACT1_A:.+]] = vector.contract {			// CHECK: %[[CONTRACT_1:.+]] = vector.contract {
	// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],			// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
	// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}			// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
	// CHECK-SAME: %[[V_INPUT3_A]], %[[V_FILTER_A]], %[[V_OUTPUT_1_A]]			// CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER_0]], %[[V_OUTPUT_1]]
	// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>			// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>
	/// w == 1, kw == 0			/// w == 1, kw == 1
	// CHECK: %[[CONTRACT0_B:.+]] = vector.contract {			// CHECK: %[[CONTRACT_2:.+]] = vector.contract {
	// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],			// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
	// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}			// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
	// CHECK-SAME: %[[V_INPUT0_B]], %[[V_FILTER_B]], %[[V_OUTPUT_0_B]]			// CHECK-SAME: %[[V_INPUT_2]], %[[V_FILTER_1]], %[[CONTRACT_0]]
	// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>			// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>
	/// w == 1, kw == 1			/// w == 1, kw == 1
	// CHECK: %[[CONTRACT1_B:.+]] = vector.contract {			// CHECK: %[[CONTRACT_3:.+]] = vector.contract {
	// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],			// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
	// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}			// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
	// CHECK-SAME: %[[V_INPUT3_B]], %[[V_FILTER_B]], %[[V_OUTPUT_1_B]]			// CHECK-SAME: %[[V_INPUT_3]], %[[V_FILTER_1]], %[[CONTRACT_1]]
	// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>			// CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32>

	/// w == 0, kw == 0			/// w == 0, kw == 0
	// CHECK: vector.transfer_write %[[CONTRACT0_A]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]			// CHECK: %[[RES_0:.+]] = vector.insert_strided_slice %[[CONTRACT_2]], %[[V_OUTPUT_R]]
	/// w == 0, kw == 1			// CHECK-SAME: {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<4x1x8xf32> into vector<4x2x8xf32>
	// CHECK: vector.transfer_write %[[CONTRACT1_A]], %[[OUTPUT]][%[[C0]], %[[C1]], %[[C0]]]
	/// w == 1, kw == 0			/// w == 1, kw == 0
	// CHECK: vector.transfer_write %[[CONTRACT0_B]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]			// CHECK: %[[RES_1:.+]] = vector.insert_strided_slice %[[CONTRACT_3]], %[[RES_0]]
	/// w == 1, kw == 1			// CHECK-SAME: {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<4x1x8xf32> into vector<4x2x8xf32>
	// CHECK: vector.transfer_write %[[CONTRACT1_B]], %[[OUTPUT]][%[[C0]], %[[C1]], %[[C0]]]
				// Write the result back in one shot.
				// CHECK: vector.transfer_write %[[RES_1]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]

	// -----			// -----

				func @conv1d_nwc_4x2x8_memref(%input: memref<4x6x3xf32>, %filter: memref<2x3x8xf32>, %output: memref<4x2x8xf32>) {
				linalg.conv_1d_nwc_wcf
				{dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
				ins(%input, %filter : memref<4x6x3xf32>, memref<2x3x8xf32>)
				outs(%output : memref<4x2x8xf32>)
				return
				}

	// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>			// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
	// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>			// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
	// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>			// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>

	// CHECK: func @conv1d_nwc_4x2x8_memref			// CHECK: func @conv1d_nwc_4x2x8_memref
	// CHECK-SAME: (%[[INPUT:.+]]: memref<4x6x3xf32>, %[[FILTER:.+]]: memref<2x3x8xf32>, %[[OUTPUT:.+]]: memref<4x2x8xf32>)			// CHECK-SAME: (%[[INPUT:.+]]: memref<4x6x3xf32>, %[[FILTER:.+]]: memref<2x3x8xf32>, %[[OUTPUT:.+]]: memref<4x2x8xf32>)
	func @conv1d_nwc_4x2x8_memref(%input: memref<4x6x3xf32>, %filter: memref<2x3x8xf32>, %output: memref<4x2x8xf32>) {
	// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index			// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
	// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
	// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
	// CHECK-DAG: %[[F0:.+]] = arith.constant 0.000000e+00 : f32			// CHECK-DAG: %[[F0:.+]] = arith.constant 0.000000e+00 : f32

				/// Read the whole data in one shot.
				// CHECK-DAG: %[[V_INPUT_R:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_FILTER_R:.+]] = vector.transfer_read %[[FILTER]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]
				// CHECK-DAG: %[[V_OUTPUT_R:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]

	/// w == 0, kw == 0			/// w == 0, kw == 0
	// CHECK: %[[V_FILTER_000:.+]] = vector.transfer_read %[[FILTER]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]{{.*}} vector<3x8xf32>			// CHECK: %[[V_FILTER_0:.+]] = vector.extract %[[V_FILTER_R]][0] : vector<2x3x8xf32>
	// CHECK: %[[V_INPUT_000:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]{{.*}} vector<4x2x3xf32>			// CHECK: %[[V_INPUT_0:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
	// CHECK: %[[V_OUTPUT_0:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]{{.*}} vector<4x2x8xf32>			// CHECK-SAME: {offsets = [0, 0, 0], sizes = [4, 2, 3], strides = [1, 1, 1]} : vector<4x5x3xf32> to vector<4x2x3xf32>
	/// w == 0, kw == 1			/// w == 0, kw == 1
	// CHECK: %[[V_FILTER_100:.+]] = vector.transfer_read %[[FILTER]][%[[C1]], %[[C0]], %[[C0]]], %[[F0]]{{.*}} vector<3x8xf32>			// CHECK: %[[V_FILTER_1:.+]] = vector.extract %[[V_FILTER_R]][1] : vector<2x3x8xf32>
	// CHECK: %[[V_INPUT_020:.+]] = vector.transfer_read %[[INPUT]][%[[C0]], %[[C2]], %[[C0]]], %[[F0]]{{.*}} vector<4x2x3xf32>			// CHECK: %[[V_INPUT_1:.+]] = vector.extract_strided_slice %[[V_INPUT_R]]
	// CHECK: %[[V_OUTPUT_1:.+]] = vector.transfer_read %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]], %[[F0]]{{.*}} vector<4x2x8xf32>			// CHECK-SAME: {offsets = [0, 2, 0], sizes = [4, 2, 3], strides = [1, 1, 1]} : vector<4x5x3xf32> to vector<4x2x3xf32>

	/// w == 0, kw == 0			/// w == 0, kw == 0
	// CHECK: %[[CONTRACT0:.+]] = vector.contract {			// CHECK: %[[CONTRACT_0:.+]] = vector.contract {
	// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],			// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
	// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}			// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
	// CHECK-SAME: %[[V_INPUT_000]], %[[V_FILTER_000]], %[[V_OUTPUT_0]]			// CHECK-SAME: %[[V_INPUT_0]], %[[V_FILTER_0]], %[[V_OUTPUT_R]]
	// CHECK-SAME: : vector<4x2x3xf32>, vector<3x8xf32> into vector<4x2x8xf32>			// CHECK-SAME: : vector<4x2x3xf32>, vector<3x8xf32> into vector<4x2x8xf32>
	/// w == 0, kw == 1			/// w == 1, kw == 1
	// CHECK: %[[CONTRACT1:.+]] = vector.contract {			// CHECK: %[[CONTRACT_1:.+]] = vector.contract {
	// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],			// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]],
	// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}			// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
	// CHECK-SAME: %[[V_INPUT_020]], %[[V_FILTER_100]], %[[V_OUTPUT_1]]			// CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER_1]], %[[CONTRACT_0]]
	// CHECK-SAME: : vector<4x2x3xf32>, vector<3x8xf32> into vector<4x2x8xf32>			// CHECK-SAME: : vector<4x2x3xf32>, vector<3x8xf32> into vector<4x2x8xf32>

	/// w == 0, kw == 0			// Write the result back in one shot.
	// CHECK: vector.transfer_write %[[CONTRACT0]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]			// CHECK: vector.transfer_write %[[CONTRACT_1]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]
	/// w == 0, kw == 1
	// CHECK: vector.transfer_write %[[CONTRACT1]], %[[OUTPUT]][%[[C0]], %[[C0]], %[[C0]]]
	linalg.conv_1d_nwc_wcf
	{dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
	ins(%input, %filter : memref<4x6x3xf32>, memref<2x3x8xf32>)
	outs(%output : memref<4x2x8xf32>)
	return
	}