Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1463,6 +1463,11 @@ // series of EXTRACT_VECTOR_ELT nodes are generated, one for each element in // the vector. If all are expanded here, we don't want one store per vector // element. + + // Caches for hasPredecessorHelper + SmallPtrSet Visited; + SmallVector Worklist; + SDValue StackPtr, Ch; for (SDNode::use_iterator UI = Vec.getNode()->use_begin(), UE = Vec.getNode()->use_end(); UI != UE; ++UI) { @@ -1477,6 +1482,12 @@ if (!ST->getChain().reachesChainWithoutSideEffects(DAG.getEntryNode())) continue; + // If the index is dependent on the store we will introduce a cycle when + // creating the load (the load uses the index, and by replacing the chain + // we will make the index dependent on the load). + if (Idx.getNode()->hasPredecessorHelper(ST, Visited, Worklist)) + continue; + StackPtr = ST->getBasePtr(); Ch = SDValue(ST, 0); break; Index: test/CodeGen/X86/extractelement-legalization-cycle.ll =================================================================== --- test/CodeGen/X86/extractelement-legalization-cycle.ll +++ test/CodeGen/X86/extractelement-legalization-cycle.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; When the extractelement is converted to a load the store can be re-used. +; This will, however, introduce a cycle into the selection DAG (the load +; of the extractelement index is dependent on the store, and so after the +; conversion it becomes dependent on the new load, which is dependent on +; the index). Make sure we skip the store, and conservatively instead +; use a store to the stack. + +define float @foo(i32* %i, <4 x float>* %v) { +; CHECK-LABEL: foo: +; CHECK: movaps %xmm0, -[[OFFSET:[0-9]+]](%rsp) +; CHECK: movss -[[OFFSET]](%rsp,{{.*}}), %xmm0 {{.*}} +; CHECK-NEXT: retq + %1 = load <4 x float>, <4 x float>* %v, align 16 + %mul = fmul <4 x float> %1, %1 + store <4 x float> %mul, <4 x float>* %v, align 16 + %2 = load i32, i32* %i, align 4 + %vecext = extractelement <4 x float> %mul, i32 %2 + ret float %vecext +}