diff --git a/mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp b/mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp --- a/mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp +++ b/mlir/lib/Dialect/X86Vector/Transforms/AVXTranspose.cpp @@ -186,8 +186,56 @@ vs[7] = mm256Permute2f128Ps(ib, s3, s7, MaskHelper::permute<3, 1>()); } -/// Rewrite avx2-specific 2-D vector.transpose, for the supported cases and -/// depending on the `TransposeLoweringOptions`. +/// Given the n-D transpose pattern 'transp', return true if 'dim0' and 'dim1' +/// should be transposed with each other within the context of their 2D +/// transposition slice. +/// +/// Example 1: dim0 = 0, dim1 = 2, transp = [2, 1, 0] +/// Return true: dim0 and dim1 are transposed within the context of their 2D +/// transposition slice ([1, 0]). +/// +/// Example 2: dim0 = 0, dim1 = 1, transp = [2, 1, 0] +/// Return true: dim0 and dim1 are transposed within the context of their 2D +/// transposition slice ([1, 0]). Paradoxically, note how dim1 (1) is *not* +/// transposed within the full context of the transposition. +/// +/// Example 3: dim0 = 0, dim1 = 1, transp = [2, 0, 1] +/// Return false: dim0 and dim1 are *not* transposed within the context of +/// their 2D transposition slice ([0, 1]). Paradoxically, note how dim0 (0) +/// and dim1 (1) are transposed within the full context of the +/// transposition. +static bool areDimsTransposedIn2DSlice(int64_t dim0, int64_t dim1, + ArrayRef transp) { + // Perform a linear scan along the dimensions of the transposed pattern. If + // dim0 is found first, dim0 and dim1 are not transposed within the context of + // their 2D slice. Otherwise, 'dim1' is found first and they are transposed. 
+ for (int64_t permDim : transp) { + if (permDim == dim0) + return false; + if (permDim == dim1) + return true; + } + + llvm_unreachable("Ill-formed transpose pattern"); +} + +/// Rewrite AVX2-specific vector.transpose, for the supported cases and +/// depending on the `TransposeLoweringOptions`. The lowering supports 2-D +/// transpose cases and n-D cases that have been decomposed into 2-D +/// transposition slices. For example, a 3-D transpose: +/// +/// %0 = vector.transpose %arg0, [2, 0, 1] +/// : vector<1024x2048x4096xf32> to vector<4096x1024x2048xf32> +/// +/// could be sliced into 2-D transposes by tiling two of its dimensions to one +/// of the vector lengths supported by the AVX2 patterns (e.g., 4x8): +/// +/// %0 = vector.transpose %arg0, [2, 0, 1] +/// : vector<1x4x8xf32> to vector<8x1x4xf32> +/// +/// This lowering will analyze the n-D vector.transpose and determine if it's a +/// supported 2-D transposition slice where any of the AVX2 patterns can be +/// applied. class TransposeOpLowering : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -201,42 +249,69 @@ PatternRewriter &rewriter) const override { auto loc = op.getLoc(); + // Check if the source vector type is supported. AVX2 patterns can only be + // applied if the vector type has two dimensions greater than one. 
VectorType srcType = op.getVectorType(); - if (srcType.getRank() != 2) - return rewriter.notifyMatchFailure(op, "Not a 2-D transpose"); + SmallVector srcGtOneDims; + for (auto &en : llvm::enumerate(srcType.getShape())) + if (en.value() > 1) + srcGtOneDims.push_back(en.index()); + + if (srcGtOneDims.size() != 2) + return rewriter.notifyMatchFailure(op, "Unsupported vector type"); SmallVector transp; for (auto attr : op.transp()) transp.push_back(attr.cast().getInt()); - if (transp[0] != 1 && transp[1] != 0) - return rewriter.notifyMatchFailure(op, "Not a 2-D transpose permutation"); - int64_t m = srcType.getShape().front(), n = srcType.getShape().back(); + // Check whether the two source vector dimensions that are greater than one + // must be transposed with each other so that we can apply one of the 2-D + // AVX2 transpose patterns. Otherwise, these patterns are not applicable. + if (!areDimsTransposedIn2DSlice(srcGtOneDims[0], srcGtOneDims[1], transp)) + return rewriter.notifyMatchFailure( + op, "Not applicable to this transpose permutation"); + + // Retrieve the sizes of the two dimensions greater than one to be + // transposed. + auto srcShape = srcType.getShape(); + int64_t m = srcShape[srcGtOneDims[0]], n = srcShape[srcGtOneDims[1]]; auto applyRewrite = [&]() { ImplicitLocOpBuilder ib(loc, rewriter); SmallVector vs; + + // Reshape the n-D input vector with only two dimensions greater than one + // to a 2-D vector. + auto flattenedType = + VectorType::get({n * m}, op.getVectorType().getElementType()); + auto reshInputType = VectorType::get({m, n}, srcType.getElementType()); + auto reshInput = + ib.create(flattenedType, op.vector()); + reshInput = ib.create(reshInputType, reshInput); + + // Extract 1-D vectors from the higher-order dimension of the input + // vector. for (int64_t i = 0; i < m; ++i) - vs.push_back(ib.create(op.vector(), i)); + vs.push_back(ib.create(reshInput, i)); + + // Transpose set of 1-D vectors. 
if (m == 4) transpose4x8xf32(ib, vs); if (m == 8) transpose8x8xf32(ib, vs); - auto flattenedType = - VectorType::get({n * m}, op.getVectorType().getElementType()); - auto transposedType = - VectorType::get({n, m}, op.getVectorType().getElementType()); - Value res = ib.create( - op.getVectorType(), ib.getZeroAttr(op.getVectorType())); - // The transposed form is still 4x8 and needs to be reinterpreted as 8x4 - // via shape_casts. + + // Insert transposed 1-D vectors into the higher-order dimension of the + // output vector. + Value res = ib.create(reshInputType, + ib.getZeroAttr(reshInputType)); for (int64_t i = 0; i < m; ++i) res = ib.create(vs[i], res, i); - if (m == 4) { - res = ib.create(flattenedType, res); - res = ib.create(transposedType, res); - } + // The output vector still has the shape of the input vector (e.g., 4x8). + // We have to transpose their dimensions and retrieve its original rank + // (e.g., 1x8x1x4x1). + res = ib.create(flattenedType, res); + res = ib.create(op.getResultType(), res); rewriter.replaceOp(op, res); return success(); }; diff --git a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir --- a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir @@ -1,7 +1,7 @@ -// RUN: mlir-opt %s -test-vector-transpose-lowering=eltwise=1 | FileCheck %s --check-prefix=ELTWISE -// RUN: mlir-opt %s -test-vector-transpose-lowering=shuffle=1 | FileCheck %s --check-prefix=SHUFFLE -// RUN: mlir-opt %s -test-vector-transpose-lowering=flat=1 | FileCheck %s --check-prefix=FLAT -// RUN: mlir-opt %s -test-vector-transpose-lowering=avx2=1 | FileCheck %s --check-prefix=AVX2 +// RUN: mlir-opt %s -test-vector-transpose-lowering=eltwise=1 -split-input-file | FileCheck %s --check-prefix=ELTWISE +// RUN: mlir-opt %s -test-vector-transpose-lowering=shuffle=1 -split-input-file | FileCheck %s --check-prefix=SHUFFLE +// RUN: mlir-opt %s 
-test-vector-transpose-lowering=flat=1 -split-input-file | FileCheck %s --check-prefix=FLAT +// RUN: mlir-opt %s -test-vector-transpose-lowering=avx2=1 -split-input-file | FileCheck %s --check-prefix=AVX2 // ELTWISE-LABEL: func @transpose23 // ELTWISE-SAME: %[[A:.*]]: vector<2x3xf32> @@ -24,6 +24,8 @@ return %0 : vector<3x2xf32> } +// ----- + // SHUFFLE-LABEL: func @transpose // FLAT-LABEL: func @transpose( func @transpose(%arg0: vector<2x4xf32>) -> vector<4x2xf32> { @@ -42,6 +44,8 @@ return %0 : vector<4x2xf32> } +// ----- + // AVX2-LABEL: func @transpose4x8 func @transpose4x8xf32(%arg0: vector<4x8xf32>) -> vector<8x4xf32> { // AVX2: vector.extract {{.*}}[0] @@ -70,9 +74,49 @@ return %0 : vector<8x4xf32> } +// ----- + +// AVX2-LABEL: func @transpose021_1x4x8 +func @transpose021_1x4x8xf32(%arg0: vector<1x4x8xf32>) -> vector<1x8x4xf32> { + // AVX2: vector.extract {{.*}}[0, 0] + // AVX2-NEXT: vector.extract {{.*}}[0, 1] + // AVX2-NEXT: vector.extract {{.*}}[0, 2] + // AVX2-NEXT: vector.extract {{.*}}[0, 3] + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 1, 8, 9, 4, 5, 12, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 3, 10, 11, 6, 7, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 1, 8, 9, 4, 5, 12, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 3, 10, 11, 6, 7, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32> + // 
AVX2-NEXT: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.insert {{.*}}[0] + // AVX2-NEXT: vector.insert {{.*}}[1] + // AVX2-NEXT: vector.insert {{.*}}[2] + // AVX2-NEXT: vector.insert {{.*}}[3] + // AVX2-NEXT: vector.shape_cast {{.*}} vector<4x8xf32> to vector<32xf32> + // AVX2-NEXT: vector.shape_cast {{.*}} vector<32xf32> to vector<1x8x4xf32> + %0 = vector.transpose %arg0, [0, 2, 1] : vector<1x4x8xf32> to vector<1x8x4xf32> + return %0 : vector<1x8x4xf32> +} + +// ----- + // AVX2-LABEL: func @transpose8x8 func @transpose8x8xf32(%arg0: vector<8x8xf32>) -> vector<8x8xf32> { - // AVX2: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2: vector.extract {{.*}}[0] + // AVX2-NEXT: vector.extract {{.*}}[1] + // AVX2-NEXT: vector.extract {{.*}}[2] + // AVX2-NEXT: vector.extract {{.*}}[3] + // AVX2-NEXT: vector.extract {{.*}}[4] + // AVX2-NEXT: vector.extract {{.*}}[5] + // AVX2-NEXT: vector.extract {{.*}}[6] + // AVX2-NEXT: vector.extract {{.*}}[7] + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> @@ -91,6 +135,475 @@ // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> // AVX2-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32> // AVX2-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.insert {{.*}}[0] + // AVX2-NEXT: vector.insert {{.*}}[1] + // AVX2-NEXT: 
vector.insert {{.*}}[2] + // AVX2-NEXT: vector.insert {{.*}}[3] + // AVX2-NEXT: vector.insert {{.*}}[4] + // AVX2-NEXT: vector.insert {{.*}}[5] + // AVX2-NEXT: vector.insert {{.*}}[6] + // AVX2-NEXT: vector.insert {{.*}}[7] %0 = vector.transpose %arg0, [1, 0] : vector<8x8xf32> to vector<8x8xf32> return %0 : vector<8x8xf32> } + +// ----- + +// AVX2-LABEL: func @transpose021_1x8x8 +func @transpose021_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<1x8x8xf32> { + // AVX2: vector.extract {{.*}}[0, 0] + // AVX2-NEXT: vector.extract {{.*}}[0, 1] + // AVX2-NEXT: vector.extract {{.*}}[0, 2] + // AVX2-NEXT: vector.extract {{.*}}[0, 3] + // AVX2-NEXT: vector.extract {{.*}}[0, 4] + // AVX2-NEXT: vector.extract {{.*}}[0, 5] + // AVX2-NEXT: vector.extract {{.*}}[0, 6] + // AVX2-NEXT: vector.extract {{.*}}[0, 7] + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: 
llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.insert {{.*}}[0] + // AVX2-NEXT: vector.insert {{.*}}[1] + // AVX2-NEXT: vector.insert {{.*}}[2] + // AVX2-NEXT: vector.insert {{.*}}[3] + // AVX2-NEXT: vector.insert {{.*}}[4] + // AVX2-NEXT: vector.insert {{.*}}[5] + // AVX2-NEXT: vector.insert {{.*}}[6] + // AVX2-NEXT: vector.insert {{.*}}[7] + // AVX2-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<1x8x8xf32> + %0 = vector.transpose %arg0, [0, 2, 1] : vector<1x8x8xf32> to vector<1x8x8xf32> + return %0 : vector<1x8x8xf32> +} + +// ----- + +// AVX2-LABEL: func @transpose120_8x1x8 +func @transpose120_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<1x8x8xf32> { + // AVX2: vector.extract {{.*}}[0, 0] + // AVX2-NEXT: vector.extract {{.*}}[1, 0] + // AVX2-NEXT: vector.extract {{.*}}[2, 0] + // AVX2-NEXT: vector.extract {{.*}}[3, 0] + // AVX2-NEXT: vector.extract {{.*}}[4, 0] + // AVX2-NEXT: vector.extract {{.*}}[5, 0] + // AVX2-NEXT: 
vector.extract {{.*}}[6, 0] + // AVX2-NEXT: vector.extract {{.*}}[7, 0] + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: 
llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.insert {{.*}}[0] + // AVX2-NEXT: vector.insert {{.*}}[1] + // AVX2-NEXT: vector.insert {{.*}}[2] + // AVX2-NEXT: vector.insert {{.*}}[3] + // AVX2-NEXT: vector.insert {{.*}}[4] + // AVX2-NEXT: vector.insert {{.*}}[5] + // AVX2-NEXT: vector.insert {{.*}}[6] + // AVX2-NEXT: vector.insert {{.*}}[7] + // AVX2-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<1x8x8xf32> + %0 = vector.transpose %arg0, [1, 2, 0] : vector<8x1x8xf32> to vector<1x8x8xf32> + return %0 : vector<1x8x8xf32> +} + +// ----- + +// AVX2-LABEL: func @transpose120_8x8x1 +func @transpose120_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<8x1x8xf32> { + // AVX2: vector.shape_cast %{{.*}} : vector<8x8x1xf32> to vector<8x8xf32> + // AVX2-NEXT: vector.extract {{.*}}[0] + // AVX2-NEXT: vector.extract {{.*}}[1] + // AVX2-NEXT: vector.extract {{.*}}[2] + // AVX2-NEXT: vector.extract {{.*}}[3] + // AVX2-NEXT: vector.extract {{.*}}[4] + // AVX2-NEXT: vector.extract {{.*}}[5] + // AVX2-NEXT: vector.extract {{.*}}[6] + // AVX2-NEXT: vector.extract {{.*}}[7] + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle 
{{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.insert {{.*}}[0] + // AVX2-NEXT: vector.insert {{.*}}[1] + // AVX2-NEXT: vector.insert {{.*}}[2] + // AVX2-NEXT: vector.insert {{.*}}[3] + // AVX2-NEXT: vector.insert {{.*}}[4] + // AVX2-NEXT: vector.insert {{.*}}[5] + // AVX2-NEXT: vector.insert {{.*}}[6] + // AVX2-NEXT: vector.insert {{.*}}[7] + // AVX2-NEXT: vector.shape_cast %{{.*}} 
: vector<8x8xf32> to vector<8x1x8xf32> + %0 = vector.transpose %arg0, [1, 2, 0] : vector<8x8x1xf32> to vector<8x1x8xf32> + return %0 : vector<8x1x8xf32> +} + +// ----- + +// AVX2-LABEL: func @transpose102_8x8x1 +func @transpose102_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<8x8x1xf32> { + // AVX2: vector.shape_cast %{{.*}} : vector<8x8x1xf32> to vector<8x8xf32> + // AVX2-NEXT: vector.extract {{.*}}[0] + // AVX2-NEXT: vector.extract {{.*}}[1] + // AVX2-NEXT: vector.extract {{.*}}[2] + // AVX2-NEXT: vector.extract {{.*}}[3] + // AVX2-NEXT: vector.extract {{.*}}[4] + // AVX2-NEXT: vector.extract {{.*}}[5] + // AVX2-NEXT: vector.extract {{.*}}[6] + // AVX2-NEXT: vector.extract {{.*}}[7] + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> 
vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.insert {{.*}}[0] + // AVX2-NEXT: vector.insert {{.*}}[1] + // AVX2-NEXT: vector.insert {{.*}}[2] + // AVX2-NEXT: vector.insert {{.*}}[3] + // AVX2-NEXT: vector.insert {{.*}}[4] + // AVX2-NEXT: vector.insert {{.*}}[5] + // AVX2-NEXT: vector.insert {{.*}}[6] + // AVX2-NEXT: vector.insert {{.*}}[7] + // AVX2-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x8x1xf32> + %0 = vector.transpose %arg0, [1, 0, 2] : vector<8x8x1xf32> to vector<8x8x1xf32> + return %0 : vector<8x8x1xf32> +} + +// ----- + +// AVX2-LABEL: func @transpose201_8x1x8 +func @transpose201_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<8x8x1xf32> { + // AVX2: vector.extract {{.*}}[0, 0] + // AVX2-NEXT: vector.extract {{.*}}[1, 0] + // AVX2-NEXT: vector.extract {{.*}}[2, 0] + // AVX2-NEXT: vector.extract {{.*}}[3, 0] + // AVX2-NEXT: vector.extract {{.*}}[4, 0] + // AVX2-NEXT: vector.extract {{.*}}[5, 0] + // AVX2-NEXT: vector.extract {{.*}}[6, 0] + // AVX2-NEXT: vector.extract {{.*}}[7, 0] + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 
4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> 
+ // AVX2-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.insert {{.*}}[0] + // AVX2-NEXT: vector.insert {{.*}}[1] + // AVX2-NEXT: vector.insert {{.*}}[2] + // AVX2-NEXT: vector.insert {{.*}}[3] + // AVX2-NEXT: vector.insert {{.*}}[4] + // AVX2-NEXT: vector.insert {{.*}}[5] + // AVX2-NEXT: vector.insert {{.*}}[6] + // AVX2-NEXT: vector.insert {{.*}}[7] + // AVX2-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x8x1xf32> + %0 = vector.transpose %arg0, [2, 0, 1] : vector<8x1x8xf32> to vector<8x8x1xf32> + return %0 : vector<8x8x1xf32> +} + +// ----- + +// AVX2-LABEL: func @transpose201_1x8x8 +func @transpose201_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<8x1x8xf32> { + // AVX2: vector.extract {{.*}}[0, 0] + // AVX2-NEXT: vector.extract {{.*}}[0, 1] + // AVX2-NEXT: vector.extract {{.*}}[0, 2] + // AVX2-NEXT: vector.extract {{.*}}[0, 3] + // AVX2-NEXT: vector.extract {{.*}}[0, 4] + // AVX2-NEXT: vector.extract {{.*}}[0, 5] + // AVX2-NEXT: vector.extract {{.*}}[0, 6] + // AVX2-NEXT: vector.extract {{.*}}[0, 7] + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: 
vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32> + // AVX2-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32> + // AVX2-NEXT: vector.insert {{.*}}[0] + // AVX2-NEXT: vector.insert {{.*}}[1] + // AVX2-NEXT: vector.insert {{.*}}[2] + // AVX2-NEXT: vector.insert {{.*}}[3] + // AVX2-NEXT: vector.insert {{.*}}[4] + // AVX2-NEXT: vector.insert {{.*}}[5] + // AVX2-NEXT: vector.insert {{.*}}[6] + // AVX2-NEXT: vector.insert {{.*}}[7] + // AVX2-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x1x8xf32> + %0 = vector.transpose %arg0, [2, 0, 1] : vector<1x8x8xf32> to vector<8x1x8xf32> + return %0 : vector<8x1x8xf32> +} + +// ----- + +// 
AVX2-LABEL: func @transpose210_8x1x8
+// Permutation [2, 1, 0] on 8x1x8 reverses the two non-unit dims (0 and 2), so
+// the op is an 8x8 2-D transposition slice. The eight rows are extracted
+// directly with [i, 0] indices, run through the 8x8 shuffle/blend sequence,
+// re-inserted, and the 8x8 result is shape_cast back to the 3-D result type.
+func @transpose210_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<8x1x8xf32> {
+  // AVX2: vector.extract {{.*}}[0, 0]
+  // AVX2-NEXT: vector.extract {{.*}}[1, 0]
+  // AVX2-NEXT: vector.extract {{.*}}[2, 0]
+  // AVX2-NEXT: vector.extract {{.*}}[3, 0]
+  // AVX2-NEXT: vector.extract {{.*}}[4, 0]
+  // AVX2-NEXT: vector.extract {{.*}}[5, 0]
+  // AVX2-NEXT: vector.extract {{.*}}[6, 0]
+  // AVX2-NEXT: vector.extract {{.*}}[7, 0]
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // AVX2-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.insert {{.*}}[0]
+  // AVX2-NEXT: vector.insert {{.*}}[1]
+  // AVX2-NEXT: vector.insert {{.*}}[2]
+  // AVX2-NEXT: vector.insert {{.*}}[3]
+  // AVX2-NEXT: vector.insert {{.*}}[4]
+  // AVX2-NEXT: vector.insert {{.*}}[5]
+  // AVX2-NEXT: vector.insert {{.*}}[6]
+  // AVX2-NEXT: vector.insert {{.*}}[7]
+  // AVX2-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x1x8xf32>
+  %0 = vector.transpose %arg0, [2, 1, 0] : vector<8x1x8xf32> to vector<8x1x8xf32>
+  return %0 : vector<8x1x8xf32>
+}
+
+// -----
+
+// AVX2-LABEL: func @transpose210_8x8x1
+// Permutation [2, 1, 0] on 8x8x1 reverses the two non-unit dims (0 and 1), so
+// the op is an 8x8 2-D transposition slice. Here the trailing unit dim is
+// dropped first with a shape_cast to 8x8, rows are extracted with a single
+// index, and the transposed 8x8 result is shape_cast to the 3-D result type.
+func @transpose210_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<1x8x8xf32> {
+  // AVX2: vector.shape_cast %{{.*}} : vector<8x8x1xf32> to vector<8x8xf32>
+  // AVX2-NEXT: vector.extract {{.*}}[0]
+  // AVX2-NEXT: vector.extract {{.*}}[1]
+  // AVX2-NEXT: vector.extract {{.*}}[2]
+  // AVX2-NEXT: vector.extract {{.*}}[3]
+  // AVX2-NEXT: vector.extract {{.*}}[4]
+  // AVX2-NEXT: vector.extract {{.*}}[5]
+  // AVX2-NEXT: vector.extract {{.*}}[6]
+  // AVX2-NEXT: vector.extract {{.*}}[7]
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // AVX2-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.insert {{.*}}[0]
+  // AVX2-NEXT: vector.insert {{.*}}[1]
+  // AVX2-NEXT: vector.insert {{.*}}[2]
+  // AVX2-NEXT: vector.insert {{.*}}[3]
+  // AVX2-NEXT: vector.insert {{.*}}[4]
+  // AVX2-NEXT: vector.insert {{.*}}[5]
+  // AVX2-NEXT: vector.insert {{.*}}[6]
+  // AVX2-NEXT: vector.insert {{.*}}[7]
+  // AVX2-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<1x8x8xf32>
+  %0 = vector.transpose %arg0, [2, 1, 0] : vector<8x8x1xf32> to vector<1x8x8xf32>
+  return %0 : vector<1x8x8xf32>
+}
+
+// -----
+
+// AVX2-LABEL: func @transpose210_1x8x8
+// Permutation [2, 1, 0] on 1x8x8 reverses the two non-unit dims (1 and 2), so
+// the op is an 8x8 2-D transposition slice. The leading unit dim is skipped by
+// extracting rows with [0, i] indices, and the transposed 8x8 result is
+// shape_cast to the 3-D result type.
+func @transpose210_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<8x8x1xf32> {
+  // AVX2: vector.extract {{.*}}[0, 0]
+  // AVX2-NEXT: vector.extract {{.*}}[0, 1]
+  // AVX2-NEXT: vector.extract {{.*}}[0, 2]
+  // AVX2-NEXT: vector.extract {{.*}}[0, 3]
+  // AVX2-NEXT: vector.extract {{.*}}[0, 4]
+  // AVX2-NEXT: vector.extract {{.*}}[0, 5]
+  // AVX2-NEXT: vector.extract {{.*}}[0, 6]
+  // AVX2-NEXT: vector.extract {{.*}}[0, 7]
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.shuffle {{.*}} [2, 10, 3, 11, 6, 14, 7, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-COUNT-4: vector.shuffle {{.*}} [2, 3, 8, 9, 6, 7, 12, 13] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-NEXT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0x33", "=x,x,x" {{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
+  // AVX2-COUNT-4: vector.shuffle {{.*}} [0, 1, 2, 3, 8, 9, 10, 11] : vector<8xf32>, vector<8xf32>
+  // AVX2-COUNT-4: vector.shuffle {{.*}} [4, 5, 6, 7, 12, 13, 14, 15] : vector<8xf32>, vector<8xf32>
+  // AVX2-NEXT: vector.insert {{.*}}[0]
+  // AVX2-NEXT: vector.insert {{.*}}[1]
+  // AVX2-NEXT: vector.insert {{.*}}[2]
+  // AVX2-NEXT: vector.insert {{.*}}[3]
+  // AVX2-NEXT: vector.insert {{.*}}[4]
+  // AVX2-NEXT: vector.insert {{.*}}[5]
+  // AVX2-NEXT: vector.insert {{.*}}[6]
+  // AVX2-NEXT: vector.insert {{.*}}[7]
+  // AVX2-NEXT: vector.shape_cast %{{.*}} : vector<8x8xf32> to vector<8x8x1xf32>
+  %0 = vector.transpose %arg0, [2, 1, 0] : vector<1x8x8xf32> to vector<8x8x1xf32>
+  return %0 : vector<8x8x1xf32>
+}
+
+// -----
+
+// AVX2-LABEL: func @transpose021_8x1x8
+// Negative test. Permutation [0, 2, 1] on 8x1x8 keeps the two non-unit dims
+// (0 and 2) in their original relative order, so only the unit dim moves and
+// no vector.shuffle may appear in the output.
+func @transpose021_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<8x8x1xf32> {
+  %0 = vector.transpose %arg0, [0, 2, 1] : vector<8x1x8xf32> to vector<8x8x1xf32>
+  return %0 : vector<8x8x1xf32>
+}
+
+// AVX2-NOT: vector.shuffle
+
+// -----
+
+// AVX2-LABEL: func @transpose021_8x8x1
+// Negative test. Permutation [0, 2, 1] on 8x8x1 keeps the two non-unit dims
+// (0 and 1) in their original relative order, so only the unit dim moves and
+// no vector.shuffle may appear in the output.
+func @transpose021_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<8x1x8xf32> {
+  %0 = vector.transpose %arg0, [0, 2, 1] : vector<8x8x1xf32> to vector<8x1x8xf32>
+  return %0 : vector<8x1x8xf32>
+}
+
+// AVX2-NOT: vector.shuffle
+
+// -----
+
+// AVX2-LABEL: func @transpose102_1x8x8
+// Negative test. Permutation [1, 0, 2] on 1x8x8 keeps the two non-unit dims
+// (1 and 2) in their original relative order, so only the unit dim moves and
+// no vector.shuffle may appear in the output.
+func @transpose102_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<8x1x8xf32> {
+  %0 = vector.transpose %arg0, [1, 0, 2] : vector<1x8x8xf32> to vector<8x1x8xf32>
+  return %0 : vector<8x1x8xf32>
+}
+
+// AVX2-NOT: vector.shuffle
+
+// -----
+
+// AVX2-LABEL: func @transpose102_8x1x8
+// Negative test. Permutation [1, 0, 2] on 8x1x8 keeps the two non-unit dims
+// (0 and 2) in their original relative order, so only the unit dim moves and
+// no vector.shuffle may appear in the output.
+func @transpose102_8x1x8xf32(%arg0: vector<8x1x8xf32>) -> vector<1x8x8xf32> {
+  %0 = vector.transpose %arg0, [1, 0, 2] : vector<8x1x8xf32> to vector<1x8x8xf32>
+  return %0 : vector<1x8x8xf32>
+}
+
+// AVX2-NOT: vector.shuffle
+
+// -----
+
+// AVX2-LABEL: func @transpose120_1x8x8
+// Negative test. Permutation [1, 2, 0] on 1x8x8 keeps the two non-unit dims
+// (1 and 2) in their original relative order, so only the unit dim moves and
+// no vector.shuffle may appear in the output.
+func @transpose120_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<8x8x1xf32> {
+
+  %0 = vector.transpose %arg0, [1, 2, 0] : vector<1x8x8xf32> to vector<8x8x1xf32>
+  return %0 : vector<8x8x1xf32>
+}
+
+// AVX2-NOT: vector.shuffle
+
+// -----
+
+// AVX2-LABEL: func @transpose201_8x8x1
+// Negative test. Permutation [2, 0, 1] on 8x8x1 keeps the two non-unit dims
+// (0 and 1) in their original relative order, so only the unit dim moves and
+// no vector.shuffle may appear in the output.
+func @transpose201_8x8x1xf32(%arg0: vector<8x8x1xf32>) -> vector<1x8x8xf32> {
+  %0 = vector.transpose %arg0, [2, 0, 1] : vector<8x8x1xf32> to vector<1x8x8xf32>
+  return %0 : vector<1x8x8xf32>
+}
+
+// AVX2-NOT: vector.shuffle
+