Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11985,6 +11985,30 @@ (N1.getOpcode() == ISD::CONCAT_VECTORS && N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) { + // Try to avoid smaller vectors: when the shuffled CONCAT_VECTORS has a + // bitcast of a scalar as its only non-undef operand, use SCALAR_TO_VECTOR: + // (vector_shuffle (v8i8 concat_vectors (v2i8 bitcast (i16)), undef..), M) + // -> + // (vector_shuffle (v8i8 bitcast (v4i16 scalar_to_vector (i16))), M) + if (N1.getOpcode() == ISD::UNDEF && + N0->getOperand(0)->getOpcode() == ISD::BITCAST && + std::all_of( + N0->ops().begin() + 1, N0->ops().end(), + [](const SDValue &Op) { return Op->getOpcode() == ISD::UNDEF; })) { + SDValue Scalar = N0->getOperand(0)->getOperand(0); + EVT ScVT = Scalar.getValueType(); + if (!ScVT.isVector()) { + SDLoc dl(N); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), ScVT, + VT.getSizeInBits() / ScVT.getSizeInBits()); + return DAG.getVectorShuffle( + VT, dl, + DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Scalar)), + N1, SVN->getMask()); + } + } + // Try to simplify either the shuffle or the concats. if (SDValue V = partitionShuffleOfConcats(N, DAG)) return V; Index: test/CodeGen/AArch64/concat_vectors-combines.ll =================================================================== --- test/CodeGen/AArch64/concat_vectors-combines.ll +++ test/CodeGen/AArch64/concat_vectors-combines.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple arm64-apple-darwin -asm-verbose=false | FileCheck %s +; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-collect-loh=false -asm-verbose=false | FileCheck %s +; Disable linker optimization hints (LOHs); they clutter the checked assembly. 
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" @@ -53,4 +54,63 @@ ret <8 x i8> %1 } +; Test the (vector_shuffle (concat_vectors (bitcast (scalar)), undef..), undef, <mask>) pattern. + +; FIXME: This should use DUP instead of the INS/MOVI/TBL sequence checked below. +define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 { +entry: +; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup: +; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0 +; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0] +; CHECK-NEXT: movi.4h v[[V1:[0-9]+]], #0x1, lsl #8 +; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]] +; CHECK-NEXT: ret + %t = trunc i32 %x to i16 + %0 = bitcast i16 %t to <2 x i8> + %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> + ret <8 x i8> %1 +} + +define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8(i32 %x) #0 { +entry: +; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8: +; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], lCPI{{.*}} +; CHECK-NEXT: ldr d[[V1:[0-9]+]], [x[[MASKPTR]], lCPI{{.*}} +; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0 +; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0] +; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]] +; CHECK-NEXT: ret + %t = trunc i32 %x to i16 + %0 = bitcast i16 %t to <2 x i8> + %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> + ret <8 x i8> %1 +} + +define <8 x i8> @test_shuffle_from_concat_scalar_v4i8_to_v8i8(i32 %x) #0 { +entry: +; CHECK-LABEL: test_shuffle_from_concat_scalar_v4i8_to_v8i8: +; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], lCPI{{.*}} +; CHECK-NEXT: ldr d[[V1:[0-9]+]], [x[[MASKPTR]], lCPI{{.*}} +; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0 +; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0] +; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]] +; CHECK-NEXT: ret + %0 = bitcast i32 %x to <4 x i8> + %1 = shufflevector <4 x i8> %0, <4 x i8> undef, <8 x i32> + ret <8 x i8> %1 +} + +define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16(i32 %x) #0 { +entry: +; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16: +; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], lCPI{{.*}} +; 
CHECK-NEXT: ldr q[[V1:[0-9]+]], [x[[MASKPTR]], lCPI{{.*}} +; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0 +; CHECK-NEXT: tbl.16b v0, { v[[V0]] }, v[[V1]] +; CHECK-NEXT: ret + %0 = bitcast i32 %x to <2 x i16> + %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> + ret <8 x i16> %1 +} + attributes #0 = { nounwind }