Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -24695,6 +24695,8 @@
   LoadSDNode *Ld = cast<LoadSDNode>(N);
   EVT RegVT = Ld->getValueType(0);
   EVT MemVT = Ld->getMemoryVT();
+  SDValue Ptr = Ld->getBasePtr();
+  SDValue Chain = Ld->getChain();
   SDLoc dl(Ld);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -24733,6 +24735,33 @@
     return DCI.CombineTo(N, NewVec, TF, true);
   }

+  // Conversion from x86mmx/i64 to v2i64 types is often done via stack
+  // store/load. Under certain conditions we can bypass the memory access and
+  // combine this load to use a scalar_to_vector instead. This reduces stack
+  // usage, avoids redundant shuffle emission, and creates isel matching
+  // candidates for movq2dq instructions.
+  if (RegVT == MVT::v2i64 && Subtarget->hasSSE2() && Ext == ISD::EXTLOAD &&
+      !Ld->isVolatile() && ISD::isNON_TRUNCStore(Chain.getNode())) {
+
+    // If this load reads back a value that was just stored, get that source.
+    StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
+    EVT SrcTy = PrevST->getValue().getValueType();
+    if (PrevST->getBasePtr() != Ptr ||
+        !(SrcTy == MVT::i64 || SrcTy == MVT::x86mmx))
+      return SDValue();
+    SDValue SrcVal = Chain.getOperand(1);
+
+    // On 32-bit targets we can't store 64-bit integers, so use f64 instead.
+    bool Usef64 = TLI.isTypeLegal(MVT::f64) && !Subtarget->is64Bit();
+    if (Usef64)
+      SrcVal = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SrcVal);
+    SrcVal = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, Usef64 ? MVT::v2f64 : RegVT,
+                         SrcVal);
+
+    return DCI.CombineTo(N, Usef64 ?
+        DAG.getNode(ISD::BITCAST, dl, RegVT, SrcVal) : SrcVal, Chain);
+  }
+
   return SDValue();
 }
Index: test/CodeGen/X86/2012-01-18-vbitcast.ll
===================================================================
--- test/CodeGen/X86/2012-01-18-vbitcast.ll
+++ test/CodeGen/X86/2012-01-18-vbitcast.ll
@@ -2,12 +2,13 @@
 ;CHECK-LABEL: vcast:
 define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
-;CHECK: pmovzxdq
-;CHECK: pmovzxdq
+;CHECK-NOT: pmovzxdq
+;CHECK-NOT: pmovzxdq
+;CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
 %af = bitcast <2 x float> %a to <2 x i32>
 %bf = bitcast <2 x float> %b to <2 x i32>
 %x = sub <2 x i32> %af, %bf
-;CHECK: psubq
+;CHECK-NEXT: psubq (%{{.*}}), %[[R0]]
 ret <2 x i32> %x
 ;CHECK: ret
 }
Index: test/CodeGen/X86/lower-bitcast.ll
===================================================================
--- test/CodeGen/X86/lower-bitcast.ll
+++ test/CodeGen/X86/lower-bitcast.ll
@@ -68,12 +68,13 @@
   %2 = bitcast <2 x i32> %add to i64
   ret i64 %2
 }
-; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd.
+; FIXME: At the moment we still produce the sequence paddd+pshufd.
 ; Ideally, we should fold that sequence into a single paddd. This is fixed with
 ; the widening legalization.
 ;
 ; CHECK-LABEL: test4
-; CHECK: pshufd
+; CHECK: movd
+; CHECK-NOT: pshufd
 ; CHECK-NEXT: paddd
 ; CHECK-NEXT: pshufd
 ; CHECK: ret
Index: test/CodeGen/X86/mmx-movq2dq.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/mmx-movq2dq.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-64
+
+; X86-32-LABEL: test0
+; X86-64-LABEL: test0
+define i32 @test0(<1 x i64>* %v4) {
+  %v5 = load <1 x i64>* %v4, align 8
+  %v12 = bitcast <1 x i64> %v5 to <4 x i16>
+  %v13 = bitcast <4 x i16> %v12 to x86_mmx
+  ; X86-32: pshufw $-18
+  ; X86-32-NOT: movq
+  ; X86-32-NOT: movsd
+  ; X86-32: movq2dq
+  ; X86-64: pshufw $-18
+  ; X86-64-NOT: movq
+  ; X86-64-NOT: pshufd
+  ; X86-64: movq2dq
+  ; X86-64-NEXT: movd
+  %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18)
+  %v15 = bitcast x86_mmx %v14 to <4 x i16>
+  %v16 = bitcast <4 x i16> %v15 to <1 x i64>
+  %v17 = extractelement <1 x i64> %v16, i32 0
+  %v18 = bitcast i64 %v17 to <2 x i32>
+  %v19 = extractelement <2 x i32> %v18, i32 0
+  %v20 = add i32 %v19, 32
+  ret i32 %v20
+}
+
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
Index: test/CodeGen/X86/widen_load-2.ll
===================================================================
--- test/CodeGen/X86/widen_load-2.ll
+++ test/CodeGen/X86/widen_load-2.ll
@@ -78,8 +78,7 @@
 ; CHECK-NEXT: paddd %[[R0]], %[[R1]]
 ; CHECK-NEXT: pextrw $4, %[[R1]], 4(%{{.*}})
 ; CHECK-NEXT: pshufb {{.*}}, %[[R1]]
-; CHECK-NEXT: pmovzxdq %[[R1]], %[[R0]]
-; CHECK-NEXT: movd %[[R0]], (%{{.*}})
+; CHECK-NEXT: movd %[[R1]], (%{{.*}})
 %a = load %i16vec3* %ap, align 16
 %b = load %i16vec3* %bp, align 16
 %x = add %i16vec3 %a, %b
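
Note (illustration only, not part of the patch): a minimal IR sketch of the kind of x86mmx/i64-to-v2i64
round-trip the new combine targets. The function name and values are hypothetical, and whether the
store/load pair actually forms depends on how legalization lowers the extract/insert; the committed
mmx-movq2dq.ll test above is the real coverage.

; An x86_mmx value whose 64 bits end up as the low element of a v2i64. Without
; the combine this may be lowered through a stack slot; with it, the store/load
; pair can become a scalar_to_vector (selectable as movq2dq when SSE2 is available).
define <2 x i64> @mmx_to_v2i64(x86_mmx %m) {
  %bits   = bitcast x86_mmx %m to <1 x i64>
  %scalar = extractelement <1 x i64> %bits, i32 0
  %vec    = insertelement <2 x i64> undef, i64 %scalar, i32 0
  ret <2 x i64> %vec
}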