Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11861,17 +11861,55 @@ static SDValue combineShuffleOfScalarInputs(SDNode *N, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT ResVT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ShuffleVectorSDNode *SVN = cast(N); + // If we only have one input, peek through bitcasts only if there is one user. + // FIXME: Is it useful to look at bitcasts on both sides? + if (N1.getOpcode() == ISD::UNDEF) + while (N0.getOpcode() == ISD::BITCAST) { + if (!N0.hasOneUse()) + break; + N0 = N0.getOperand(0); + } + + // The bitcast source needs to have an element size that's a multiple of + // the shuffle element size. If it doesn't, revert to the shuffle operand. + if (!N0.getValueType().isVector() || + (N0.getValueType().getScalarSizeInBits() % ResVT.getScalarSizeInBits())) + N0 = N->getOperand(0); + EVT VT = N0.getValueType(); EVT SVT = VT.getScalarType(); const unsigned NumElts = VT.getVectorNumElements(); + const unsigned ResNumElts = ResVT.getVectorNumElements(); + + const int Scale = SVT.getSizeInBits() / ResVT.getScalarSizeInBits(); + assert((ResNumElts % Scale) == 0); SmallVector Ops; - for (int M : SVN->getMask()) { + for (unsigned i = 0; i != ResNumElts; i += Scale) { SDValue Op = DAG.getUNDEF(SVT); + int M = SVN->getMaskElt(i); + + if (Scale > 1) { + // Normalize undef indices. + if (M < 0) + M = -Scale; + if (M % Scale) + return SDValue(); + // Make sure these are either all undef or consecutive elements. + for (int j = 0; j != Scale; ++j) { + int InnerM = SVN->getMaskElt(i + j); + if (((InnerM < 0) != (M < 0)) || + (InnerM >= 0 && InnerM != M + (int)j)) + return SDValue(); + } + M /= Scale; + } + if (M >= 0) { int Idx = M % NumElts; SDValue &S = (M < (int)NumElts ? N0 : N1); @@ -11897,7 +11935,8 @@ Op = TLI.isZExtFree(Op.getValueType(), SVT) ? DAG.getZExtOrTrunc(Op, SDLoc(N), SVT) : DAG.getSExtOrTrunc(Op, SDLoc(N), SVT); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Ops); + return DAG.getNode(ISD::BITCAST, SDLoc(N), ResVT, + DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Ops)); } SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { @@ -12055,7 +12094,7 @@ return V; } - // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - + // Attempt to combine a shuffle of 1 or 2 inputs of 'scalar sources' - // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) { if (SDValue V = combineShuffleOfScalarInputs(N, DAG)) Index: test/CodeGen/AArch64/concat_vectors-combines.ll =================================================================== --- test/CodeGen/AArch64/concat_vectors-combines.ll +++ test/CodeGen/AArch64/concat_vectors-combines.ll @@ -56,14 +56,10 @@ ; Test the (vector_shuffle (concat_vectors (bitcast (scalar)), undef..), undef, ) pattern. -; FIXME: This should use DUP. define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 { entry: ; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8_dup: -; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0 -; CHECK-NEXT: ins.d v[[V0]][1], v[[V0]][0] -; CHECK-NEXT: movi.4h v[[V1:[0-9]+]], #0x1, lsl #8 -; CHECK-NEXT: tbl.8b v0, { v[[V0]] }, v[[V1]] +; CHECK-NEXT: dup.4h v0, w0 ; CHECK-NEXT: ret %t = trunc i32 %x to i16 %0 = bitcast i16 %t to <2 x i8> @@ -71,6 +67,29 @@ ret <8 x i8> %1 } +define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16_dup(i32 %x) #0 { +entry: +; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16_dup: +; CHECK-NEXT: dup.4s v0, w0 +; CHECK-NEXT: ret + %0 = bitcast i32 %x to <2 x i16> + %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> + ret <8 x i16> %1 +} + +define <8 x i16> @test_shuffle_from_concat_scalar_v2i16_to_v8i16_duplike_invalid(i32 %x) #0 { +entry: +; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i16_to_v8i16_duplike_invalid: +; CHECK-NEXT: adrp x[[MASKPTR:[0-9]+]], lCPI{{.*}} +; CHECK-NEXT: ldr q[[V1:[0-9]+]], [x[[MASKPTR]], lCPI{{.*}} +; CHECK-NEXT: fmov s[[V0:[0-9]+]], w0 +; CHECK-NEXT: tbl.16b v0, { v[[V0]] }, v[[V1]] +; CHECK-NEXT: ret + %0 = bitcast i32 %x to <2 x i16> + %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> + ret <8 x i16> %1 +} + define <8 x i8> @test_shuffle_from_concat_scalar_v2i8_to_v8i8(i32 %x) #0 { entry: ; CHECK-LABEL: test_shuffle_from_concat_scalar_v2i8_to_v8i8: Index: test/CodeGen/ARM/vector-DAGCombine.ll =================================================================== --- test/CodeGen/ARM/vector-DAGCombine.ll +++ test/CodeGen/ARM/vector-DAGCombine.ll @@ -48,7 +48,8 @@ %2 = bitcast double %1 to i64 %3 = insertelement <1 x i64> undef, i64 %2, i32 0 ; CHECK-NOT: vmov s -; CHECK: vext.8 +; CHECK: vmov r0, r1, d +; CHECK: vmov r2, r3, d %4 = shufflevector <1 x i64> %3, <1 x i64> undef, <2 x i32> %tmp2006.3 = bitcast <2 x i64> %4 to <16 x i8> %5 = shufflevector <16 x i8> %tmp2006.3, <16 x i8> undef, <16 x i32>