diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16966,6 +16966,15 @@
   }
   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
 
+  // This will be just movd/movq/movss/movsd
+  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
+    if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+        (EltVT == MVT::i64 && Subtarget.is64Bit())) {
+      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+      return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+    }
+  }
+
   // Transform it so it match pinsr{b,w} which expects a GR32 as its second
   // argument. SSE41 required for pinsrb.
   if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
diff --git a/llvm/test/CodeGen/X86/vec_insert_first.ll b/llvm/test/CodeGen/X86/vec_insert_first.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec_insert_first.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s
+
+define dso_local void @_Z4LoopPKiRDv2_xjj(i32* nocapture readonly %d, <2 x i64>* nocapture dereferenceable(16) %res, i32 %st1, i32 %st2) local_unnamed_addr #0 {
+
+; CHECK-NOT: pinsr
+entry:
+  %idxprom.i = zext i32 %st1 to i64
+  %arrayidx.i = getelementptr inbounds i32, i32* %d, i64 %idxprom.i
+  %idxprom1.i = zext i32 %st2 to i64
+  %arrayidx2.i = getelementptr inbounds i32, i32* %d, i64 %idxprom1.i
+  %0 = load i32, i32* %arrayidx.i, align 4
+  %vecinit3.i.i = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %0, i32 0
+  %1 = bitcast <4 x i32> %vecinit3.i.i to <2 x i64>
+  %2 = load i32, i32* %arrayidx2.i, align 4
+  %vecinit3.i10.i = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %2, i32 0
+  %3 = bitcast <4 x i32> %vecinit3.i10.i to <2 x i64>
+  %4 = load <2 x i64>, <2 x i64>* %res, align 16
+  %shuffle.i.i = shufflevector <2 x i64> %1, <2 x i64> %3, <2 
x i32> <i32 0, i32 2>
+  %add.i.i = add <2 x i64> %shuffle.i.i, %4
+  store <2 x i64> %add.i.i, <2 x i64>* %res, align 16
+  %5 = load i32, i32* %arrayidx.i, align 4
+  %vecinit3.i.i.1 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %5, i32 0
+  %6 = bitcast <4 x i32> %vecinit3.i.i.1 to <2 x i64>
+  %7 = load i32, i32* %arrayidx2.i, align 4
+  %vecinit3.i10.i.1 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %7, i32 0
+  %8 = bitcast <4 x i32> %vecinit3.i10.i.1 to <2 x i64>
+  %shuffle.i.i.1 = shufflevector <2 x i64> %6, <2 x i64> %8, <2 x i32> <i32 0, i32 2>
+  %add.i.i.1 = add <2 x i64> %shuffle.i.i.1, %add.i.i
+  store <2 x i64> %add.i.i.1, <2 x i64>* %res, align 16
+  %9 = load i32, i32* %arrayidx.i, align 4
+  %vecinit3.i.i.2 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %9, i32 0
+  %10 = bitcast <4 x i32> %vecinit3.i.i.2 to <2 x i64>
+  %11 = load i32, i32* %arrayidx2.i, align 4
+  %vecinit3.i10.i.2 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %11, i32 0
+  %12 = bitcast <4 x i32> %vecinit3.i10.i.2 to <2 x i64>
+  %shuffle.i.i.2 = shufflevector <2 x i64> %10, <2 x i64> %12, <2 x i32> <i32 0, i32 2>
+  %add.i.i.2 = add <2 x i64> %shuffle.i.i.2, %add.i.i.1
+  store <2 x i64> %add.i.i.2, <2 x i64>* %res, align 16
+  %13 = load i32, i32* %arrayidx.i, align 4
+  %vecinit3.i.i.3 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %13, i32 0
+  %14 = bitcast <4 x i32> %vecinit3.i.i.3 to <2 x i64>
+  %15 = load i32, i32* %arrayidx2.i, align 4
+  %vecinit3.i10.i.3 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %15, i32 0
+  %16 = bitcast <4 x i32> %vecinit3.i10.i.3 to <2 x i64>
+  %shuffle.i.i.3 = shufflevector <2 x i64> %14, <2 x i64> %16, <2 x i32> <i32 0, i32 2>
+  %add.i.i.3 = add <2 x i64> %shuffle.i.i.3, %add.i.i.2
+  store <2 x i64> %add.i.i.3, <2 x i64>* %res, align 16
+  ret void
+}