Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12236,6 +12236,8 @@
   unsigned NumUndefLanes = 0;
   SDValue Value;
   SDValue ConstantValue;
+  SmallPtrSet<SDNode *, 2> DifferentValueSet;
+  SmallVector<int, 8> MaskVec;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
     if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
@@ -12263,6 +12265,14 @@
       usesOnlyOneValue = false;
       ++NumDifferentLanes;
     }
+
+    // Remember each distinct value seen so far.
+    DifferentValueSet.insert(V.getNode());
+    // The mask below is only valid if DifferentValueSet ends up with size 2.
+    if (V == Value)
+      MaskVec.push_back(0);
+    else
+      MaskVec.push_back(NumElts);
   }
 
   if (!Value.getNode()) {
@@ -12468,6 +12478,21 @@
     return NewVector;
   }
 
+  // If the vector consists of exactly two different values, generate two
+  // DUPs and a VECTOR_SHUFFLE that selects between them.
+  if (DifferentValueSet.size() == 2 && NumUndefLanes == 0 && NumElts >= 4) {
+    SmallVector<SDValue, 2> Vals;
+    for (auto *Val : DifferentValueSet) {
+      Vals.push_back(SDValue(Val, 0));
+    }
+    SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
+    SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
+    SDValue DUP1 = DAG.getBuildVector(VT, dl, Ops1);
+    SDValue DUP2 = DAG.getBuildVector(VT, dl, Ops2);
+    SDValue Shuffle = DAG.getVectorShuffle(VT, dl, DUP1, DUP2, MaskVec);
+    return Shuffle;
+  }
+
   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
   // know the default expansion would otherwise fall back on something even
   // worse. For a vector with one or two non-undef values, that's
Index: llvm/test/CodeGen/AArch64/arm64-dup.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -334,10 +334,11 @@
 define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: g:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    mov.s v0[1], w1
-; CHECK-NEXT:    mov.s v0[2], w1
-; CHECK-NEXT:    mov.s v0[3], w0
+; CHECK-NEXT:    dup.4s v0, w1
+; CHECK-NEXT:    dup.4s v1, w0
+; CHECK-NEXT:    uzp1.4s v0, v1, v0
+; CHECK-NEXT:    ext.16b v1, v0, v1, #8
+; CHECK-NEXT:    uzp1.4s v0, v0, v1
 ; CHECK-NEXT:    ret
   %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
Index: llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
===================================================================
--- llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
+++ llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
@@ -4,24 +4,11 @@
 define <16 x i8> @test1(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    mov v0.b[2], w8
-; CHECK-NEXT:    mov v0.b[3], w8
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    ldrb w8, [x1]
-; CHECK-NEXT:    mov v0.b[8], w8
-; CHECK-NEXT:    mov v0.b[9], w8
-; CHECK-NEXT:    mov v0.b[10], w8
-; CHECK-NEXT:    mov v0.b[11], w8
-; CHECK-NEXT:    mov v0.b[12], w8
-; CHECK-NEXT:    mov v0.b[13], w8
-; CHECK-NEXT:    mov v0.b[14], w8
-; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    ld1r { v1.16b }, [x1]
+; CHECK-NEXT:    ld1r { v0.16b }, [x0]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i8, ptr %a, align 1
@@ -75,24 +62,11 @@
 define <16 x i8> @test4(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov v0.b[1], w8
-; CHECK-NEXT:    mov v0.b[2], w8
-; CHECK-NEXT:    mov v0.b[3], w8
-; CHECK-NEXT:    mov v0.b[4], w8
-; CHECK-NEXT:    mov v0.b[5], w8
-; CHECK-NEXT:    mov v0.b[6], w8
-; CHECK-NEXT:    mov v0.b[7], w8
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    mov v0.b[8], w8
-; CHECK-NEXT:    mov v0.b[9], w8
-; CHECK-NEXT:    mov v0.b[10], w8
-; CHECK-NEXT:    mov v0.b[11], w8
-; CHECK-NEXT:    mov v0.b[12], w8
-; CHECK-NEXT:    mov v0.b[13], w8
-; CHECK-NEXT:    mov v0.b[14], w8
-; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    ld1r { v1.16b }, [x0]
+; CHECK-NEXT:    ld1r { v0.16b }, [x1]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %0 = load i8, ptr %a, align 1
Index: llvm/test/CodeGen/AArch64/concat_vector-scalar-combine.ll
===================================================================
--- llvm/test/CodeGen/AArch64/concat_vector-scalar-combine.ll
+++ llvm/test/CodeGen/AArch64/concat_vector-scalar-combine.ll
@@ -65,10 +65,11 @@
 define <8 x i16> @test_concat_scalars_2x_v2i16_to_v8i16_dup(i32 %x, i32 %y) #0 {
 entry:
 ; CHECK-LABEL: test_concat_scalars_2x_v2i16_to_v8i16_dup:
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: mov.s v0[1], w1
-; CHECK-NEXT: mov.s v0[2], w1
-; CHECK-NEXT: mov.s v0[3], w0
+; CHECK-NEXT: dup.4s v0, w1
+; CHECK-NEXT: dup.4s v1, w0
+; CHECK-NEXT: uzp1.4s v0, v1, v0
+; CHECK-NEXT: ext.16b v1, v0, v1, #8
+; CHECK-NEXT: uzp1.4s v0, v0, v1
 ; CHECK-NEXT: ret
   %bx = bitcast i32 %x to <2 x i16>
   %by = bitcast i32 %y to <2 x i16>
@@ -83,11 +84,11 @@
 define <8 x i8> @test_concat_scalars_mixed_2x_v2i8_to_v8i8(float %dummy, i32 %x, half %y) #0 {
 entry:
 ; CHECK-LABEL: test_concat_scalars_mixed_2x_v2i8_to_v8i8:
-; CHECK-NEXT: fmov s[[X:[0-9]+]], w0
-; CHECK-NEXT: mov.16b v0, v[[X]]
-; CHECK-NEXT: mov.h v0[1], v1[0]
-; CHECK-NEXT: mov.h v0[2], v[[X]][0]
-; CHECK-NEXT: mov.h v0[3], v1[0]
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: dup.4h v1, v1[0]
+; CHECK-NEXT: dup.4h v0, v0[0]
+; CHECK-NEXT: uzp1.4h v0, v0, v1
+; CHECK-NEXT: uzp1.4h v0, v0, v0
 ; CHECK-NEXT: ret
   %t = trunc i32 %x to i16
   %0 = bitcast i16 %t to <2 x i8>
@@ -99,10 +100,10 @@
 define <2 x float> @test_concat_scalars_fp_2x_v2i8_to_v8i8(float %dummy, half %x, half %y) #0 {
 entry:
 ; CHECK-LABEL: test_concat_scalars_fp_2x_v2i8_to_v8i8:
-; CHECK-NEXT: mov.16b v0, v1
-; CHECK-NEXT: mov.h v0[1], v2[0]
-; CHECK-NEXT: mov.h v0[2], v1[0]
-; CHECK-NEXT: mov.h v0[3], v2[0]
+; CHECK-NEXT: dup.4h v0, v2[0]
+; CHECK-NEXT: dup.4h v1, v1[0]
+; CHECK-NEXT: uzp1.4h v0, v1, v0
+; CHECK-NEXT: uzp1.4h v0, v0, v0
 ; CHECK-NEXT: ret
   %0 = bitcast half %x to <2 x i8>
   %y0 = bitcast half %y to <2 x i8>
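
Note for reviewers (illustration only, not part of the patch): below is a minimal standalone C++ sketch of the mask construction the patch adds to LowerBUILD_VECTOR. The helper name buildTwoValueShuffleMask and its parameters are hypothetical; the real patch builds MaskVec inline while scanning the BUILD_VECTOR operands. The key observation is that both shuffle operands are splats (DUPs), so element 0 of either operand holds the right value for every lane; the mask therefore only ever contains 0 (first operand) or NumElts (second operand).

// Standalone sketch of the two-value shuffle-mask logic (hypothetical
// helper, not LLVM code). `A` plays the role of `Value` in the patch:
// lanes equal to A select element 0 of DUP(A); all other lanes select
// element 0 of DUP(B), i.e. shuffle index NumElts.
#include <cstdio>
#include <vector>

std::vector<int> buildTwoValueShuffleMask(const std::vector<int> &Lanes,
                                          int A) {
  std::vector<int> Mask;
  int NumElts = static_cast<int>(Lanes.size());
  for (int V : Lanes)
    Mask.push_back(V == A ? 0 : NumElts);
  return Mask;
}

int main() {
  // Mirrors the @g test in arm64-dup.ll: BUILD_VECTOR <a, b, b, a>.
  std::vector<int> Lanes = {7, 9, 9, 7}; // a = 7, b = 9
  for (int M : buildTwoValueShuffleMask(Lanes, /*A=*/7))
    std::printf("%d ", M); // prints: 0 4 4 0
  std::printf("\n");
  return 0;
}

For the <a, b, b, a> pattern this yields the mask <0, 4, 4, 0>, which the AArch64 shuffle lowering then matches with existing uzp1/ext sequences, as the updated CHECK lines above show.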