Diff 385285

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

Show All 10 Lines

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"		#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"		#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"		#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/Utils.h"		#include "mlir/Dialect/SCF/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"		#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"		#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/VectorTransforms.h"		#include "mlir/Dialect/Vector/VectorTransforms.h"
		#include "mlir/Dialect/X86Vector/Transforms.h"
#include "mlir/IR/Identifier.h"		#include "mlir/IR/Identifier.h"
#include "mlir/IR/PatternMatch.h"		#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/Bufferize.h"		#include "mlir/Transforms/Bufferize.h"
#include "llvm/ADT/SmallBitVector.h"		#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"		#include "llvm/ADT/SmallSet.h"

namespace mlir {		namespace mlir {
class BufferizeTypeConverter;		class BufferizeTypeConverter;
▲ Show 20 Lines • Show All 986 Lines • ▼ Show 20 Lines	struct LinalgVectorLoweringOptions {
}		}
/// Enable lowering of vector.transpose.		/// Enable lowering of vector.transpose.
/// In a progressive lowering of vectors, this would be the 7th step.		/// In a progressive lowering of vectors, this would be the 7th step.
bool transposeLowering = false;		bool transposeLowering = false;
LinalgVectorLoweringOptions &enableVectorTransposeLowering(bool val = true) {		LinalgVectorLoweringOptions &enableVectorTransposeLowering(bool val = true) {
transposeLowering = val;		transposeLowering = val;
return *this;		return *this;
}		}
		/// Enable AVX2-specific lowerings. Disabled by default. The setter returns
		/// `*this` so that option calls can be chained.
		bool avx2Lowering = false;
		LinalgVectorLoweringOptions &enableAVX2Lowering(bool val = true) {
		avx2Lowering = val;
		return *this;
		}

/// Configure the post staged-patterns late vector.transfer to scf		/// Configure the post staged-patterns late vector.transfer to scf
/// conversion.		/// conversion.
VectorTransferToSCFOptions vectorTransferToSCFOptions;		VectorTransferToSCFOptions vectorTransferToSCFOptions;
LinalgVectorLoweringOptions &		LinalgVectorLoweringOptions &
setVectorTransferToSCFOptions(VectorTransferToSCFOptions options) {		setVectorTransferToSCFOptions(VectorTransferToSCFOptions options) {
vectorTransferToSCFOptions = options;		vectorTransferToSCFOptions = options;
return *this;		return *this;
}		}
/// Configure late vector transformations.		/// Configure late vector transformations.
vector::VectorTransformsOptions vectorTransformOptions;		vector::VectorTransformsOptions vectorTransformOptions;
LinalgVectorLoweringOptions &		LinalgVectorLoweringOptions &
setVectorTransformsOptions(vector::VectorTransformsOptions options) {		setVectorTransformsOptions(vector::VectorTransformsOptions options) {
vectorTransformOptions = options;		vectorTransformOptions = options;
return *this;		return *this;
}		}
		/// Configure the specialized AVX2 vector lowerings. The setter stores a copy
		/// of `options` and returns `*this` so that option calls can be chained.
		x86vector::avx2::LoweringOptions avx2LoweringOptions;
		LinalgVectorLoweringOptions &
		setAVX2LoweringOptions(x86vector::avx2::LoweringOptions options) {
		avx2LoweringOptions = options;
		return *this;
		}
};		};

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Transformations exposed as rewrite patterns.		// Transformations exposed as rewrite patterns.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
/// Trait to check if T provides a `getOperationName` method.		/// Trait to check if T provides a `getOperationName` method.
template <typename T, typename... Args>		template <typename T, typename... Args>
using has_get_operation_name = decltype(T::getOperationName());		using has_get_operation_name = decltype(T::getOperationName());
▲ Show 20 Lines • Show All 332 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/X86Vector/Transforms.h

	//=- Transforms.h - X86Vector Dialect Transformation Entrypoints -- C++ --=//			//=- Transforms.h - X86Vector Dialect Transformation Entrypoints -- C++ --=//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef MLIR_DIALECT_X86VECTOR_TRANSFORMS_H			#ifndef MLIR_DIALECT_X86VECTOR_TRANSFORMS_H
	#define MLIR_DIALECT_X86VECTOR_TRANSFORMS_H			#define MLIR_DIALECT_X86VECTOR_TRANSFORMS_H

				#include "mlir/IR/Value.h"

	namespace mlir {			namespace mlir {

				class ImplicitLocOpBuilder;
	class LLVMConversionTarget;			class LLVMConversionTarget;
	class LLVMTypeConverter;			class LLVMTypeConverter;
	class RewritePatternSet;			class RewritePatternSet;
	using OwningRewritePatternList = RewritePatternSet;			using OwningRewritePatternList = RewritePatternSet;

				namespace x86vector {

				/// Helper class to factor out the creation and extraction of masks from nibs.
				struct MaskHelper {
				/// b01 captures the lower 2 bits, b67 captures the higher 2 bits.
				/// Meant to be used with instructions such as mm256ShufflePs.
				template <int b67, int b45, int b23, int b01>
				aartbikUnsubmitted Done Reply Inline Actions how about making these unsigned (which also makes your subsequent overflow asserts correct, since this would pass for negative numbers ;-) aartbik: how about making these unsigned (which also makes your subsequent overflow asserts correct…
				static char shuffle() {
				static_assert(b01 <= 0x03, "overflow");
				static_assert(b23 <= 0x03, "overflow");
				static_assert(b45 <= 0x03, "overflow");
				static_assert(b67 <= 0x03, "overflow");
				return (b67 << 6) + (b45 << 4) + (b23 << 2) + b01;
				}
				/// b01 captures the lower 2 bits, b67 captures the higher 2 bits.
				static void extractShuffle(char mask, char &b01, char &b23, char &b45,
				char &b67) {
				b67 = (mask & (0x03 << 6)) >> 6;
				b45 = (mask & (0x03 << 4)) >> 4;
				b23 = (mask & (0x03 << 2)) >> 2;
				b01 = mask & 0x03;
				}
				/// b03 captures the lower 4 bits, b47 captures the higher 4 bits.
				/// Meant to be used with instructions such as mm256Permute2f128Ps.
				template <int b47, int b03>
				static char permute() {
				static_assert(b03 <= 0x0f, "overflow");
				static_assert(b47 <= 0x0f, "overflow");
				return (b47 << 4) + b03;
				}
				/// b03 captures the lower 4 bits, b47 captures the higher 4 bits.
				static void extractPermute(char mask, char &b03, char &b47) {
				b47 = (mask & (0x0f << 4)) >> 4;
				b03 = mask & 0x0f;
				}
				};

				//===----------------------------------------------------------------------===//
				/// Helpers extracted from:
				/// - clang/lib/Headers/avxintrin.h
				/// - clang/test/CodeGen/X86/avx-builtins.c
				/// - clang/test/CodeGen/X86/avx2-builtins.c
				/// - clang/test/CodeGen/X86/avx-shuffle-builtins.c
				/// as well as the Intel Intrinsics Guide
				/// (https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html)
				/// make it easier to just implement known good lowerings.
				/// All intrinsics correspond 1-1 to the Intel definition.
				//===----------------------------------------------------------------------===//

				namespace avx2 {

				/// Lower to vector.shuffle v1, v2, [0, 8, 1, 9, 4, 12, 5, 13], i.e.
				/// interleave elements 0-1 and 4-5 of `v1` with the same elements of `v2`.
				Value mm256UnpackLoPs(ImplicitLocOpBuilder &b, Value v1, Value v2);

				/// Lower to vector.shuffle v1, v2, [2, 10, 3, 11, 6, 14, 7, 15], i.e.
				/// interleave elements 2-3 and 6-7 of `v1` with the same elements of `v2`.
				Value mm256UnpackHiPs(ImplicitLocOpBuilder &b, Value v1, Value v2);

				/// Lower to a vector.shuffle of v1 (a) and v2 (b) whose result lanes come
				/// from:                a a b b a a b b
				/// Takes an 8-bit mask made of four 2-bit fields b01, b23, b45, b67 (b01 is
				/// the lowest two bits), each selecting a position in [0, 4):
				///   0:127                    | 128:255
				///   b01  b23  b45+8  b67+8   | b01+4  b23+4  b45+8+4  b67+8+4
				/// See MaskHelper::shuffle for building the mask.
				Value mm256ShufflePs(ImplicitLocOpBuilder &b, Value v1, Value v2, char mask);

				/// Lower to a vector.shuffle of v1 (a) and v2 (b) that picks one 128-bit
				/// half per result half. The low 4 bits of `mask` control result[0:127],
				/// the high 4 bits control result[128:255]. Each control value selects:
				///      0           1             2            3
				///   a[0:127] or a[128:255] or b[0:127] or b[128:255]
				/// Control values above 3 are not supported and hit llvm_unreachable.
				/// See MaskHelper::permute for building the mask.
				Value mm256Permute2f128Ps(ImplicitLocOpBuilder &b, Value v1, Value v2,
				char mask);

				/// 4x8xf32-specific AVX2 transpose lowering. `vs` must hold exactly 4 values
				/// of type vector<8xf32> (checked by asserts) and is overwritten in place
				/// with the 4 resulting vectors.
				void transpose4x8xf32(ImplicitLocOpBuilder &ib, MutableArrayRef<Value> vs);

				/// 8x8xf32-specific AVX2 transpose lowering. `vs` must hold exactly 8 values
				/// of type vector<8xf32> (checked by asserts) and is overwritten in place
				/// with the 8 resulting vectors.
				void transpose8x8xf32(ImplicitLocOpBuilder &ib, MutableArrayRef<Value> vs);

				/// Structure to control the behavior of specialized AVX2 transpose lowering.
				dcaballeUnsubmitted Done Reply Inline Actions nit: avx2 -> AVX2 for consistency dcaballe: nit: avx2 -> AVX2 for consistency
				struct TransposeLoweringOptions {
				  /// Per-shape switches; every specialized lowering is off by default.
				  bool lower4x8xf32_ = false;
				  bool lower8x8xf32_ = false;

				  /// Turn the 4x8xf32 lowering on (default) or off. Returns `*this` so
				  /// calls can be chained.
				  TransposeLoweringOptions &lower4x8xf32(bool lower = true) {
				    return assign(lower4x8xf32_, lower);
				  }

				  /// Turn the 8x8xf32 lowering on (default) or off. Returns `*this` so
				  /// calls can be chained.
				  TransposeLoweringOptions &lower8x8xf32(bool lower = true) {
				    return assign(lower8x8xf32_, lower);
				  }

				private:
				  /// Store `value` into `flag` and hand back the options object.
				  TransposeLoweringOptions &assign(bool &flag, bool value) {
				    flag = value;
				    return *this;
				  }
				};

				/// Options for controlling specialized AVX2 lowerings.
				struct LoweringOptions {
				/// Configure specialized vector lowerings.
				TransposeLoweringOptions transposeOptions;
				LoweringOptions &setTransposeOptions(TransposeLoweringOptions options) {
				transposeOptions = options;
				return *this;
				}
				};

				/// Insert specialized transpose lowering patterns. `options` selects which
				/// shapes (4x8xf32, 8x8xf32) are rewritten; `benefit` is the pattern benefit
				/// given to the inserted pattern.
				void populateSpecializedTransposeLoweringPatterns(
				RewritePatternSet &patterns, LoweringOptions options = LoweringOptions(),
				int benefit = 10);

				} // namespace avx2
				} // namespace x86vector

	/// Collect a set of patterns to lower X86Vector ops to ops that map to LLVM			/// Collect a set of patterns to lower X86Vector ops to ops that map to LLVM
	/// intrinsics.			/// intrinsics.
	void populateX86VectorLegalizeForLLVMExportPatterns(			void populateX86VectorLegalizeForLLVMExportPatterns(
	LLVMTypeConverter &converter, RewritePatternSet &patterns);			LLVMTypeConverter &converter, RewritePatternSet &patterns);

	/// Configure the target to support lowering X86Vector ops to ops that map to			/// Configure the target to support lowering X86Vector ops to ops that map to
	/// LLVM intrinsics.			/// LLVM intrinsics.
	void configureX86VectorLegalizeForExportTarget(LLVMConversionTarget &target);			void configureX86VectorLegalizeForExportTarget(LLVMConversionTarget &target);

	} // namespace mlir			} // namespace mlir

	#endif // MLIR_DIALECT_X86VECTOR_TRANSFORMS_H			#endif // MLIR_DIALECT_X86VECTOR_TRANSFORMS_H

mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt

Show First 20 Lines • Show All 79 Lines • ▼ Show 20 Lines	add_mlir_dialect_library(MLIRLinalgTransforms
MLIRPass		MLIRPass
MLIRStandard		MLIRStandard
MLIRStandardOpsTransforms		MLIRStandardOpsTransforms
MLIRStandardToLLVM		MLIRStandardToLLVM
MLIRTensor		MLIRTensor
MLIRTransforms		MLIRTransforms
MLIRTransformUtils		MLIRTransformUtils
MLIRVector		MLIRVector
		MLIRX86VectorTransforms
MLIRVectorToSCF		MLIRVectorToSCF
)		)

mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp

Show First 20 Lines • Show All 290 Lines • ▼ Show 20 Lines	struct LinalgStrategyLowerVectorsPass

void runOnFunction() override {		void runOnFunction() override {
auto funcOp = getFunction();		auto funcOp = getFunction();
if (!anchorFuncName.empty() && funcOp.getName() != anchorFuncName)		if (!anchorFuncName.empty() && funcOp.getName() != anchorFuncName)
return;		return;

MLIRContext *context = funcOp.getContext();		MLIRContext *context = funcOp.getContext();
RewritePatternSet patterns(context);		RewritePatternSet patterns(context);
vector::populateVectorToVectorCanonicalizationPatterns(patterns);		vector::populateVectorToVectorCanonicalizationPatterns(patterns);
		dcaballeUnsubmitted Done Reply Inline Actions what happened here? Should we remove it? dcaballe: what happened here? Should we remove it?
		nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions oops, thanks! nicolasvasilache: oops, thanks!
// In a progressive lowering of vectors, this would be the 1st step.		// In a progressive lowering of vectors, this would be the 1st step.
if (options.contractionLowering) {		if (options.contractionLowering) {
patterns.add<ContractionOpToOuterProductOpLowering,		patterns.add<ContractionOpToOuterProductOpLowering,
ContractionOpToMatmulOpLowering, ContractionOpLowering>(		ContractionOpToMatmulOpLowering, ContractionOpLowering>(
options.vectorTransformOptions, context);		options.vectorTransformOptions, context);
vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);		vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
}		}
// In a progressive lowering of vectors, this would be the 2nd step.		// In a progressive lowering of vectors, this would be the 2nd step.
Show All 21 Lines	void runOnFunction() override {
// In a progressive lowering of vectors, this would be the 6th step.		// In a progressive lowering of vectors, this would be the 6th step.
if (options.shapeCastLowering) {		if (options.shapeCastLowering) {
vector::populateVectorShapeCastLoweringPatterns(patterns);		vector::populateVectorShapeCastLoweringPatterns(patterns);
}		}
// In a progressive lowering of vectors, this would be the 7th step.		// In a progressive lowering of vectors, this would be the 7th step.
if (options.transposeLowering) {		if (options.transposeLowering) {
vector::populateVectorTransposeLoweringPatterns(		vector::populateVectorTransposeLoweringPatterns(
patterns, options.vectorTransformOptions);		patterns, options.vectorTransformOptions);
		if (options.avx2Lowering)
		x86vector::avx2::populateSpecializedTransposeLoweringPatterns(
		patterns, options.avx2LoweringOptions);
}		}
(void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));		(void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
}		}

LinalgVectorLoweringOptions options;		LinalgVectorLoweringOptions options;
LinalgTransformationFilter filter;		LinalgTransformationFilter filter;
};		};

▲ Show 20 Lines • Show All 80 Lines • Show Last 20 Lines

mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp

This file was added.

				//===- AVXTranspose.cpp - Transforms from Vector to X86Vector dialects ----===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// This file implements AVX2-specific vector.transpose rewrites as 1->N patterns.
				//
				aartbikUnsubmitted Done Reply Inline Actions The filename seems to imply it is specific to AVX target and the transpose op. Perhaps make the L9 and L1 description more consistent with that? aartbik: The filename seems to imply it is specific to AVX target and the transpose op. Perhaps make the…
				//===----------------------------------------------------------------------===//

				#include "mlir/Dialect/Vector/VectorOps.h"
				#include "mlir/Dialect/X86Vector/Transforms.h"
				#include "mlir/IR/ImplicitLocOpBuilder.h"
				#include "mlir/IR/Matchers.h"
				#include "mlir/IR/PatternMatch.h"

				using namespace mlir;
				using namespace mlir::vector;
				using namespace mlir::x86vector;
				using namespace mlir::x86vector::avx2;

				/// Emit a vector.shuffle of `v1` and `v2` that interleaves elements 0-1 and
				/// 4-5 of both operands (indices >= 8 address `v2`).
				Value mlir::x86vector::avx2::mm256UnpackLoPs(ImplicitLocOpBuilder &b, Value v1,
				                                             Value v2) {
				  static constexpr int64_t kUnpackLoIndices[] = {0, 8, 1, 9, 4, 12, 5, 13};
				  return b.create<vector::ShuffleOp>(v1, v2,
				                                     ArrayRef<int64_t>(kUnpackLoIndices));
				}

				/// Emit a vector.shuffle of `v1` and `v2` that interleaves elements 2-3 and
				/// 6-7 of both operands (indices >= 8 address `v2`).
				Value mlir::x86vector::avx2::mm256UnpackHiPs(ImplicitLocOpBuilder &b, Value v1,
				                                             Value v2) {
				  static constexpr int64_t kUnpackHiIndices[] = {2, 10, 3, 11, 6, 14, 7, 15};
				  return b.create<vector::ShuffleOp>(v1, v2,
				                                     ArrayRef<int64_t>(kUnpackHiIndices));
				}
				/// Emit a vector.shuffle of `v1` (a) and `v2` (b) producing  a a b b a a b b.
				/// `mask` holds four 2-bit selectors b01, b23, b45, b67 (b01 in the lowest
				/// bits). The shuffle indices are:
				///   0:127                    | 128:255
				///   b01  b23  b45+8  b67+8   | b01+4  b23+4  b45+8+4  b67+8+4
				Value mlir::x86vector::avx2::mm256ShufflePs(ImplicitLocOpBuilder &b, Value v1,
				                                            Value v2, char mask) {
				  char sel01, sel23, sel45, sel67;
				  MaskHelper::extractShuffle(mask, sel01, sel23, sel45, sel67);

				  // Shuffle indices >= 8 address the second operand.
				  const int64_t secondOperand = 8;
				  SmallVector<int64_t> indices;
				  // The same selection is applied to each 128-bit half (4 elements).
				  for (int64_t halfBase = 0; halfBase < 8; halfBase += 4) {
				    indices.push_back(sel01 + halfBase);
				    indices.push_back(sel23 + halfBase);
				    indices.push_back(sel45 + secondOperand + halfBase);
				    indices.push_back(sel67 + secondOperand + halfBase);
				  }
				  return b.create<vector::ShuffleOp>(v1, v2, indices);
				}

				/// Emit a vector.shuffle of `v1` (a) and `v2` (b) that picks one 128-bit
				/// half per result half. The low 4 bits of `mask` control result[0:127],
				/// the high 4 bits control result[128:255]. Each control value selects:
				///      0           1             2            3
				///   a[0:127] or a[128:255] or b[0:127] or b[128:255]
				/// Any other control value hits llvm_unreachable.
				Value mlir::x86vector::avx2::mm256Permute2f128Ps(ImplicitLocOpBuilder &b,
				                                                 Value v1, Value v2,
				                                                 char mask) {
				  char lowControl, highControl;
				  MaskHelper::extractPermute(mask, lowControl, highControl);

				  SmallVector<int64_t> indices;
				  const char controls[] = {lowControl, highControl};
				  for (char control : controls) {
				    if (control < 0 || control > 3)
				      llvm_unreachable("control > 3 : overflow");
				    // Control k selects the 4 consecutive elements starting at 4 * k of
				    // the concatenation (v1, v2).
				    const int64_t first = 4 * static_cast<int64_t>(control);
				    for (int64_t offset = 0; offset < 4; ++offset)
				      indices.push_back(first + offset);
				  }
				  return b.create<vector::ShuffleOp>(v1, v2, indices);
				}

				/// AVX2 4x8xf32-specific transpose lowering using a "C intrinsics" model.
				void mlir::x86vector::avx2::transpose4x8xf32(ImplicitLocOpBuilder &ib,
				MutableArrayRef<Value> vs) {
				auto vt = VectorType::get({8}, Float32Type::get(ib.getContext()));
				(void)vt;
				assert(vs.size() == 4 && "expects 4 vectors");
				aartbikUnsubmitted Done Reply Inline Actions I am guessing this is there to avoid "unused variable" errors in no NDEBUG mode that removes the asserts? aartbik: I am guessing this is there to avoid "unused variable" errors in no NDEBUG mode that removes…
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions yup. nicolasvasilache: yup.
				mehdi_aminiUnsubmitted Not Done Reply Inline Actions The `#ifndef NDEBUG` alternative would also eliminate the two uniquing calls / lookup in prod here (the optimizer won't remove these side-effecting `get()`) mehdi_amini: The `#ifndef NDEBUG` alternative would also eliminate the two uniquing calls / lookup in prod…
				assert(llvm::all_of(ValueRange{vs}.getTypes(),
				[&](Type t) { return t == vt; }) &&
				"expects all types to be vector<8xf32>");

				Value T0 = mm256UnpackLoPs(ib, vs[0], vs[1]);
				Value T1 = mm256UnpackHiPs(ib, vs[0], vs[1]);
				Value T2 = mm256UnpackLoPs(ib, vs[2], vs[3]);
				Value T3 = mm256UnpackHiPs(ib, vs[2], vs[3]);
				Value S0 = mm256ShufflePs(ib, T0, T2, MaskHelper::shuffle<1, 0, 1, 0>());
				Value S1 = mm256ShufflePs(ib, T0, T2, MaskHelper::shuffle<3, 2, 3, 2>());
				dcaballeUnsubmitted Done Reply Inline Actions Just a philosophical comment on whether we want to promote this way of describing masks. Even though these are AVX2 specific intrinsics, this mask format is so misleading if you compare it with how the shuffle mask is represented in MLIR and LLVM... I guess, it will facilitate the portability of code based on AVX2 intrinsics. That's an important point. dcaballe: Just a philosophical comment on whether we want to promote this way of describing masks. Even…
				nicolasvasilacheAuthorUnsubmitted Done Reply Inline Actions In the end, masks lower to a single i8 and anyone can use `0xCE`. However I will strongly push back against writing the compiler this way because it is hyper-obfuscating (in fact, even C intrinsics have an MM_SHUFFLE macro to try and improve things). Depending on the instruction / instrinsic the 8bits are interpreted in various confusing ways.. My position is that the alternative is so terrible, that we should absolutely promote this way of describing masks. Translating existing code may be trickier but it will also force anyone doing this translation understand what it is doing. nicolasvasilache: In the end, masks lower to a single i8 and anyone can use `0xCE`. However I will strongly push…
				Value S2 = mm256ShufflePs(ib, T1, T3, MaskHelper::shuffle<1, 0, 1, 0>());
				Value S3 = mm256ShufflePs(ib, T1, T3, MaskHelper::shuffle<3, 2, 3, 2>());
				vs[0] = mm256Permute2f128Ps(ib, S0, S1, MaskHelper::permute<2, 0>());
				vs[1] = mm256Permute2f128Ps(ib, S2, S3, MaskHelper::permute<2, 0>());
				vs[2] = mm256Permute2f128Ps(ib, S0, S1, MaskHelper::permute<3, 1>());
				vs[3] = mm256Permute2f128Ps(ib, S2, S3, MaskHelper::permute<3, 1>());
				}

				/// AVX2 8x8xf32-specific transpose lowering using a "C intrinsics" model.
				/// `vs` must hold exactly 8 values of type vector<8xf32>; it is overwritten
				/// in place with the 8 vectors produced by the unpack / shuffle / permute
				/// sequence below.
				void mlir::x86vector::avx2::transpose8x8xf32(ImplicitLocOpBuilder &ib,
				                                             MutableArrayRef<Value> vs) {
				  // `vt` only feeds the asserts, so it is built in assert-enabled builds
				  // only rather than being computed and `(void)`-ed in release builds.
				#ifndef NDEBUG
				  auto vt = VectorType::get({8}, Float32Type::get(ib.getContext()));
				  assert(vs.size() == 8 && "expects 8 vectors");
				  assert(llvm::all_of(ValueRange{vs}.getTypes(),
				                      [&](Type t) { return t == vt; }) &&
				         "expects all types to be vector<8xf32>");
				#endif

				  // Step 1: interleave pairs of adjacent rows.
				  Value T0 = mm256UnpackLoPs(ib, vs[0], vs[1]);
				  Value T1 = mm256UnpackHiPs(ib, vs[0], vs[1]);
				  Value T2 = mm256UnpackLoPs(ib, vs[2], vs[3]);
				  Value T3 = mm256UnpackHiPs(ib, vs[2], vs[3]);
				  Value T4 = mm256UnpackLoPs(ib, vs[4], vs[5]);
				  Value T5 = mm256UnpackHiPs(ib, vs[4], vs[5]);
				  Value T6 = mm256UnpackLoPs(ib, vs[6], vs[7]);
				  Value T7 = mm256UnpackHiPs(ib, vs[6], vs[7]);
				  // Step 2: shuffle within each 128-bit half.
				  Value S0 = mm256ShufflePs(ib, T0, T2, MaskHelper::shuffle<1, 0, 1, 0>());
				  Value S1 = mm256ShufflePs(ib, T0, T2, MaskHelper::shuffle<3, 2, 3, 2>());
				  Value S2 = mm256ShufflePs(ib, T1, T3, MaskHelper::shuffle<1, 0, 1, 0>());
				  Value S3 = mm256ShufflePs(ib, T1, T3, MaskHelper::shuffle<3, 2, 3, 2>());
				  Value S4 = mm256ShufflePs(ib, T4, T6, MaskHelper::shuffle<1, 0, 1, 0>());
				  Value S5 = mm256ShufflePs(ib, T4, T6, MaskHelper::shuffle<3, 2, 3, 2>());
				  Value S6 = mm256ShufflePs(ib, T5, T7, MaskHelper::shuffle<1, 0, 1, 0>());
				  Value S7 = mm256ShufflePs(ib, T5, T7, MaskHelper::shuffle<3, 2, 3, 2>());
				  // Step 3: recombine 128-bit halves across vectors.
				  vs[0] = mm256Permute2f128Ps(ib, S0, S4, MaskHelper::permute<2, 0>());
				  vs[1] = mm256Permute2f128Ps(ib, S1, S5, MaskHelper::permute<2, 0>());
				  vs[2] = mm256Permute2f128Ps(ib, S2, S6, MaskHelper::permute<2, 0>());
				  vs[3] = mm256Permute2f128Ps(ib, S3, S7, MaskHelper::permute<2, 0>());
				  vs[4] = mm256Permute2f128Ps(ib, S0, S4, MaskHelper::permute<3, 1>());
				  vs[5] = mm256Permute2f128Ps(ib, S1, S5, MaskHelper::permute<3, 1>());
				  vs[6] = mm256Permute2f128Ps(ib, S2, S6, MaskHelper::permute<3, 1>());
				  vs[7] = mm256Permute2f128Ps(ib, S3, S7, MaskHelper::permute<3, 1>());
				}

				namespace {
				/// Rewrite AVX2-specific 2-D vector.transpose, for the supported cases and
				/// depending on the `TransposeLoweringOptions`.
				///
				/// Only f32 transposes of shape 4x8 or 8x8 with permutation [1, 0] are
				/// rewritten, and only when the matching option is enabled; everything
				/// else fails to match. The class lives in an anonymous namespace so that
				/// it has internal linkage and cannot clash with a like-named pattern in
				/// another translation unit.
				class TransposeOpLowering : public OpRewritePattern<vector::TransposeOp> {
				public:
				  using OpRewritePattern<vector::TransposeOp>::OpRewritePattern;

				  TransposeOpLowering(LoweringOptions loweringOptions, MLIRContext *context,
				                      int benefit)
				      : OpRewritePattern<vector::TransposeOp>(context, benefit),
				        loweringOptions(loweringOptions) {}

				  LogicalResult matchAndRewrite(vector::TransposeOp op,
				                                PatternRewriter &rewriter) const override {
				    auto loc = op.getLoc();

				    VectorType srcType = op.getVectorType();
				    if (srcType.getRank() != 2)
				      return rewriter.notifyMatchFailure(op, "Not a 2-D transpose");

				    // The helpers below assert vector<8xf32> rows, so reject every other
				    // element type instead of tripping those asserts.
				    if (!srcType.getElementType().isF32())
				      return rewriter.notifyMatchFailure(op, "Not an f32 transpose");

				    SmallVector<int64_t, 4> transp;
				    for (auto attr : op.transp())
				      transp.push_back(attr.cast<IntegerAttr>().getInt());
				    // Reject unless the permutation is exactly [1, 0] (was `&&`, which
				    // only rejected when both entries were wrong).
				    if (transp[0] != 1 || transp[1] != 0)
				      return rewriter.notifyMatchFailure(op, "Not a 2-D transpose permutation");

				    int64_t m = srcType.getShape().front(), n = srcType.getShape().back();

				    auto applyRewrite = [&]() {
				      ImplicitLocOpBuilder ib(loc, rewriter);
				      // Split the source into its m rows.
				      SmallVector<Value> vs;
				      // Review note (aartbik): int64_t for i, since m is that type?
				      for (int64_t i = 0; i < m; ++i)
				        vs.push_back(ib.create<vector::ExtractOp>(op.vector(), i));
				      if (m == 4)
				        transpose4x8xf32(ib, vs);
				      if (m == 8)
				        transpose8x8xf32(ib, vs);
				      // Reassemble the rows into an m x n vector.
				      Value res = ib.create<arith::ConstantOp>(
				          op.getVectorType(), ib.getZeroAttr(op.getVectorType()));
				      // Review note (aartbik): int64_t for i, since m is that type?
				      for (int64_t i = 0; i < m; ++i)
				        res = ib.create<vector::InsertOp>(vs[i], res, i);
				      // For 4x8 the reassembled value still has the source type (4x8)
				      // while the op result is 8x4; replacing the op with it would produce
				      // a type mismatch. Reinterpret it via shape casts.
				      // NOTE(review): this assumes transpose4x8xf32 leaves the data in the
				      // row-major order of the 8x4 result -- confirm with an integration
				      // test.
				      if (m != n) {
				        Type elementType = srcType.getElementType();
				        auto flattenedType = VectorType::get({m * n}, elementType);
				        auto transposedType = VectorType::get({n, m}, elementType);
				        res = ib.create<vector::ShapeCastOp>(flattenedType, res);
				        res = ib.create<vector::ShapeCastOp>(transposedType, res);
				      }
				      rewriter.replaceOp(op, res);
				      return success();
				    };

				    if (loweringOptions.transposeOptions.lower4x8xf32_ && m == 4 && n == 8)
				      return applyRewrite();
				    if (loweringOptions.transposeOptions.lower8x8xf32_ && m == 8 && n == 8)
				      return applyRewrite();
				    return failure();
				  }

				private:
				  /// Selects which of the supported shapes are rewritten.
				  LoweringOptions loweringOptions;
				};
				} // namespace

				/// Add the AVX2-specific vector.transpose pattern to `patterns`, configured
				/// with `options` and registered with the given `benefit`.
				void mlir::x86vector::avx2::populateSpecializedTransposeLoweringPatterns(
				    RewritePatternSet &patterns, LoweringOptions options, int benefit) {
				  MLIRContext *ctx = patterns.getContext();
				  patterns.add<TransposeOpLowering>(options, ctx, benefit);
				}

mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt

	add_mlir_dialect_library(MLIRX86VectorTransforms			add_mlir_dialect_library(MLIRX86VectorTransforms
				AVXTranspose.cpp
	LegalizeForLLVMExport.cpp			LegalizeForLLVMExport.cpp

	DEPENDS			DEPENDS
	MLIRX86VectorConversionsIncGen			MLIRX86VectorConversionsIncGen

	LINK_LIBS PUBLIC			LINK_LIBS PUBLIC
	MLIRArithmetic			MLIRArithmetic
	MLIRX86Vector			MLIRX86Vector
	MLIRIR			MLIRIR
	MLIRLLVMCommonConversion			MLIRLLVMCommonConversion
	MLIRLLVMIR			MLIRLLVMIR
				MLIRVector
	)			)

mlir/test/Dialect/Vector/vector-transpose-to-shuffle.mlir

	// RUN: mlir-opt %s -test-vector-contraction-conversion=vector-shuffle-transpose=1 \| FileCheck %s			// RUN: mlir-opt %s -test-vector-contraction-conversion=vector-shuffle-transpose=1
				//\| FileCheck %s

	// CHECK-LABEL: func @transpose			// CHECK-LABEL: func @transpose
	func @transpose(%arg0: vector<2x4xf32>) -> vector<4x2xf32> {			func @transpose(%arg0: vector<2x4xf32>) -> vector<4x2xf32> {
	// CHECK: vector.shape_cast %{{.*}} : vector<2x4xf32> to vector<8xf32>			// CHECK: vector.shape_cast %{{.*}} : vector<2x4xf32> to vector<8xf32>
	// 0 4			// 0 4
	// 0 1 2 3 1 5			// 0 1 2 3 1 5
	// 4 5 6 7 -> 2 6			// 4 5 6 7 -> 2 6
	// 3 7			// 3 7
	// CHECK: vector.shuffle %{{.*}} [0, 4, 1, 5, 2, 6, 3, 7] : vector<8xf32>, vector<8xf32>			// CHECK: vector.shuffle %{{.*}} [0, 4, 1, 5, 2, 6, 3, 7] : vector<8xf32>, vector<8xf32>
	// CHECK: vector.shape_cast %{{.*}} : vector<8xf32> to vector<4x2xf32>			// CHECK: vector.shape_cast %{{.*}} : vector<8xf32> to vector<4x2xf32>
	%0 = vector.transpose %arg0, [1, 0] : vector<2x4xf32> to vector<4x2xf32>			%0 = vector.transpose %arg0, [1, 0] : vector<2x4xf32> to vector<4x2xf32>
	return %0 : vector<4x2xf32>			return %0 : vector<4x2xf32>
	}			}

				// CHECK-LABEL: func @transpose8x8
				func @transpose8x8(%arg0: vector<8x8xf32>) -> vector<8x8xf32> {
				%0 = vector.transpose %arg0, [1, 0] : vector<8x8xf32> to vector<8x8xf32>
				return %0 : vector<8x8xf32>
				}

utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

Show First 20 Lines • Show All 1,452 Lines • ▼ Show 20 Lines	cc_library(
hdrs = ["include/mlir/Dialect/X86Vector/Transforms.h"],		hdrs = ["include/mlir/Dialect/X86Vector/Transforms.h"],
includes = ["include"],		includes = ["include"],
deps = [		deps = [
":ArithmeticDialect",		":ArithmeticDialect",
":IR",		":IR",
":LLVMCommonConversion",		":LLVMCommonConversion",
":LLVMDialect",		":LLVMDialect",
":StandardOps",		":StandardOps",
		":VectorOps",
":X86Vector",		":X86Vector",
"//llvm:Core",		"//llvm:Core",
"//llvm:Support",		"//llvm:Support",
],		],
)		)

gentbl_cc_library(		gentbl_cc_library(
name = "X86VectorConversionIncGen",		name = "X86VectorConversionIncGen",
▲ Show 20 Lines • Show All 4,924 Lines • ▼ Show 20 Lines	deps = [
":SCFTransforms",		":SCFTransforms",
":StandardOps",		":StandardOps",
":StandardOpsTransforms",		":StandardOpsTransforms",
":Support",		":Support",
":TensorDialect",		":TensorDialect",
":TransformUtils",		":TransformUtils",
":VectorOps",		":VectorOps",
":VectorToSCF",		":VectorToSCF",
		":X86VectorTransforms",
"//llvm:Support",		"//llvm:Support",
],		],
)		)

cc_library(		cc_library(
name = "TilingInterface",		name = "TilingInterface",
srcs = ["lib/Interfaces/TilingInterface.cpp"],		srcs = ["lib/Interfaces/TilingInterface.cpp"],
hdrs = ["include/mlir/Interfaces/TilingInterface.h"],		hdrs = ["include/mlir/Interfaces/TilingInterface.h"],
▲ Show 20 Lines • Show All 1,015 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][X86Vector] Add specialized vector.transpose lowering patterns for AVX2
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 385285

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

mlir/include/mlir/Dialect/X86Vector/Transforms.h

mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt

mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp

mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp

mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt

mlir/test/Dialect/Vector/vector-transpose-to-shuffle.mlir

utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][X86Vector] Add specialized vector.transpose lowering patterns for AVX2ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 385285

mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h

mlir/include/mlir/Dialect/X86Vector/Transforms.h

mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt

mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp

mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp

mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt

mlir/test/Dialect/Vector/vector-transpose-to-shuffle.mlir

utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

[mlir][X86Vector] Add specialized vector.transpose lowering patterns for AVX2
ClosedPublic