[DAGCombiner] Fold (shuffle (bitcast (scalar_to_vector x)), undef) into a
shuffle of the wider scalar type.

When the shuffle's scalar type is narrower than the scalar_to_vector input
type, the mask is inspected in groups of Scale narrow lanes, where Scale is
the ratio of the two scalar sizes. If every group either repeats the wide
scalar (indices 0..Scale-1, optionally followed by trailing undefs) or is
entirely undef, the shuffle is re-expressed on the wide type and the result
is bitcast back to the original type.

Review notes on this revision of the patch:
  * A group that mixes defined lanes with trailing undef lanes now maps to
    wide lane 0. It was previously emitted as undef (-1), which discarded
    lanes that the original mask defined. The set of masks that fold is
    unchanged; only the emitted wide mask differs.
  * The second and third DAGCombiner.cpp hunk headers are adjusted for the
    six lines this adds.
  * The text this patch was recovered from had lost its line breaks and every
    "<...>" span that starts with a letter. ArrayRef<int> and
    SmallVector<int, 8> were re-derived from how the values are used; the
    inline size and the reconstructed indentation need confirming against
    the tree (TODO). The mask operand of the two new shufflevector
    instructions and the mask placeholder in the test-file comment could not
    be recovered and must be restored before this patch is applied.

Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12051,9 +12051,9 @@
     }
   }
 
-  // If this shuffle only has a single input that is a bitcasted shuffle,
-  // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
-  // back to their original types.
+  // If this shuffle only has a single input that is either a bitcast shuffle or
+  // scalar_to_vector, attempt to merge the 2 nodes and suitably bitcast the
+  // inputs/output back to their original types.
   if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
       N1.getOpcode() == ISD::UNDEF && Level < AfterLegalizeVectorOps &&
       TLI.isTypeLegal(VT)) {
@@ -12066,6 +12066,50 @@
       BC0 = BC0.getOperand(0);
     }
 
+    EVT SVT = VT.getScalarType();
+    EVT InnerVT = BC0->getValueType(0);
+    EVT InnerSVT = InnerVT.getScalarType();
+
+    // If the shuffle scalar type is smaller than the scalar_to_vector input
+    // type, try to express the shuffle in terms of the bigger scalar.
+    // This lets us recognize special shuffle patterns more easily.
+    if (BC0.getOpcode() == ISD::SCALAR_TO_VECTOR && BC0.hasOneUse() &&
+        InnerSVT.bitsGT(SVT) && TLI.isTypeLegal(InnerVT) &&
+        0 == (InnerSVT.getSizeInBits() % SVT.getSizeInBits())) {
+
+      int Scale = InnerSVT.getSizeInBits() / SVT.getSizeInBits();
+
+      // Look for either the repetition of the scalar, or undefs.
+      bool CanFold = true;
+      SmallVector<int, 8> NewMask;
+      for (int i = 0; i != NumElts; i += Scale) {
+        // A wide lane may stay undef only if every narrow lane it covers is
+        // undef. Once any covered lane is defined, the wide lane must select
+        // the scalar so that the defined lane keeps its value.
+        bool SawUndef = false;
+        bool SawDefined = false;
+        for (int j = 0; j != Scale; ++j) {
+          int Idx = SVN->getMaskElt(i + j);
+          if (Idx < 0)
+            SawUndef = true;
+          else if (SawUndef || Idx != j)
+            CanFold = false;
+          else
+            SawDefined = true;
+        }
+        if (!CanFold)
+          break;
+        NewMask.push_back(SawDefined ? 0 : -1);
+      }
+
+      if (CanFold && TLI.isShuffleMaskLegal(NewMask, InnerVT)) {
+        return DAG.getNode(ISD::BITCAST, SDLoc(N), VT,
+                           DAG.getVectorShuffle(InnerVT, SDLoc(N), BC0,
+                                                DAG.getUNDEF(InnerVT),
+                                                NewMask));
+      }
+    }
+
     auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) {
       if (Scale == 1)
         return SmallVector<int, 8>(Mask.begin(), Mask.end());
@@ -12078,10 +12122,6 @@
     };
 
     if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
-      EVT SVT = VT.getScalarType();
-      EVT InnerVT = BC0->getValueType(0);
-      EVT InnerSVT = InnerVT.getScalarType();
-
       // Determine which shuffle works with the smaller scalar type.
       EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
       EVT ScaleSVT = ScaleVT.getScalarType();
Index: test/CodeGen/AArch64/concat_vectors-combines.ll
===================================================================
--- test/CodeGen/AArch64/concat_vectors-combines.ll
+++ test/CodeGen/AArch64/concat_vectors-combines.ll
@@ -56,14 +56,10 @@
 
 ; Test the (vector_shuffle (concat_vectors (bitcast (scalar)), undef..), undef, ) pattern.
 
-; FIXME: This should use DUP.
 define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 {
 entry:
 ; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup:
-; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
-; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0]
-; CHECK-NEXT: movi.4h v[[V1:[0-9]+]], #0x1, lsl #8
-; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: dup.4h v0, w0
 ; CHECK-NEXT: ret
   %t = trunc i32 %x to i16
   %0 = bitcast i16 %t to <2 x i8>
@@ -71,6 +67,29 @@
   ret <8 x i8> %1
 }
 
+define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16_dup:
+; CHECK-NEXT: dup.4s v0, w0
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16>
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32>
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16_duplike_invalid(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16_duplike_invalid:
+; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], lCPI{{.*}}
+; CHECK-NEXT: ldr q[[V1:[0-9]+]], [x[[MASKPTR]], lCPI{{.*}}
+; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0
+; CHECK-NEXT: tbl.16b v0, { v[[V0]] }, v[[V1]]
+; CHECK-NEXT: ret
+  %0 = bitcast i32 %x to <2 x i16>
+  %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32>
+  ret <8 x i16> %1
+}
+
 define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8(i32 %x) #0 {
 entry:
 ; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8: