Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5677,6 +5677,35 @@
     }
   }
 
+  // Try to fold extract_vector_elt of a load into the load's address
+  // computation: rather than loading the whole vector and extracting one
+  // lane, load the selected scalar directly from Addr + Index * Size.
+  // NOTE(review): nothing here checks that the vector load is non-volatile or
+  // that Index is in range -- confirm both are acceptable for this fold.
+  if (Ld.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      ISD::isNormalLoad(Ld.getOperand(0).getNode())) {
+    // Store size, in bytes, of the scalar element being extracted.
+    auto Size = Ld.getSimpleValueType().getStoreSize();
+    SDValue RealLoad = Ld.getOperand(0);
+    SDValue Index = Ld.getOperand(1);
+    SDValue Addr = RealLoad.getOperand(1);
+    // Byte offset of the selected element from the start of the vector.
+    // NOTE(review): computed in Index's type but added to Addr in Addr's type;
+    // this assumes the two types match -- confirm.
+    SDValue Offset =
+        DAG.getNode(ISD::MUL, dl, Index.getSimpleValueType(), Index,
+                    DAG.getConstant(Size, Index.getSimpleValueType()));
+    SDValue NewAddr =
+        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr, Offset);
+    LoadSDNode *Load = cast<LoadSDNode>(RealLoad);
+    // Replace the extract with a scalar load on the original load's chain.
+    // NOTE(review): the memory operand is derived with offset 0 even though
+    // the address is displaced by a variable Offset -- confirm.
+    Ld = DAG.getLoad(Ld.getSimpleValueType(), dl, Load->getChain(), NewAddr,
+                     DAG.getMachineFunction().getMachineMemOperand(
+                         Load->getMemOperand(), 0, Size));
+  }
+
   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
 
Index: test/CodeGen/X86/vec_splat.ll
===================================================================
--- test/CodeGen/X86/vec_splat.ll
+++ test/CodeGen/X86/vec_splat.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2
 ; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX
 
 define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
 	%tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0		; <<4 x float>> [#uses=1]
@@ -32,3 +33,20 @@
 ; SSE3-LABEL: test_v2sd:
 ; SSE3: movddup
 }
+
+; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
+define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
+  %vec.ptr = getelementptr inbounds <4 x float>* %ptr, i64 %i
+  %vec = load <4 x float>* %vec.ptr, align 16
+  %lane = trunc i64 %j to i32 ; lane index is a run-time value, not a constant
+  %elt = extractelement <4 x float> %vec, i32 %lane
+  %splat0 = insertelement <4 x float> undef, float %elt, i32 0 ; splat %elt to all four lanes
+  %splat1 = insertelement <4 x float> %splat0, float %elt, i32 1
+  %splat2 = insertelement <4 x float> %splat1, float %elt, i32 2
+  %splat3 = insertelement <4 x float> %splat2, float %elt, i32 3
+  ret <4 x float> %splat3
+
+; AVX-LABEL: load_extract_splat:
+; AVX-NOT: rsp
+; AVX: vbroadcastss
+}