Index: llvm/trunk/include/llvm/CodeGen/MachineInstr.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineInstr.h
+++ llvm/trunk/include/llvm/CodeGen/MachineInstr.h
@@ -1100,6 +1100,9 @@
   ///
   bool hasUnmodeledSideEffects() const;
 
+  /// Returns true if it is illegal to fold a load across this instruction.
+  bool isLoadFoldBarrier() const;
+
   /// Return true if all the defs of this instruction are dead.
   bool allDefsAreDead() const;
 
Index: llvm/trunk/lib/CodeGen/MachineInstr.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/MachineInstr.cpp
+++ llvm/trunk/lib/CodeGen/MachineInstr.cpp
@@ -1503,6 +1503,10 @@
   return false;
 }
 
+bool MachineInstr::isLoadFoldBarrier() const {
+  return mayStore() || isCall() || hasUnmodeledSideEffects();
+}
+
 /// allDefsAreDead - Return true if all the defs of this instruction are dead.
 ///
 bool MachineInstr::allDefsAreDead() const {
Index: llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
+++ llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1234,9 +1234,9 @@
       if (MI->isDebugValue())
           continue;
 
-      // If there exists an instruction which belongs to the following
-      // categories, we will discard the load candidates.
-      if (MI->mayStore() || MI->isCall() || MI->hasUnmodeledSideEffects())
+      // If we run into an instruction we can't fold across, discard
+      // the load candidates.
+      if (MI->isLoadFoldBarrier())
         FoldAsLoadDefCandidates.clear();
 
       if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() ||
Index: llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
+++ llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -528,13 +528,10 @@
       DefMI->getParent() != FrameSetup->getParent())
     return nullptr;
 
-  // Now, make sure everything else up until the ADJCALLSTACK is a sequence
-  // of MOVs. To be less conservative would require duplicating a lot of the
-  // logic from PeepholeOptimizer.
-  // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
-  // to be smarter about folding into pushes.
+  // Make sure we don't have any instructions between DefMI and the
+  // push that make folding the load illegal.
   for (auto I = DefMI; I != FrameSetup; ++I)
-    if (I->getOpcode() != X86::MOV32rm)
+    if (I->isLoadFoldBarrier())
       return nullptr;
 
   return DefMI;
Index: llvm/trunk/test/CodeGen/X86/movtopush.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/movtopush.ll
+++ llvm/trunk/test/CodeGen/X86/movtopush.ll
@@ -357,3 +357,26 @@
   call void @good(i32 9, i32 10, i32 11, i32 12)
   ret void
 }
+
+; Make sure the add does not prevent folding loads into pushes.
+; val1 and val2 will not be folded into pushes since they have
+; an additional use, but val3 should be.
+; NORMAL-LABEL: test13:
+; NORMAL: movl ([[P1:%e..]]), [[V1:%e..]]
+; NORMAL-NEXT: movl ([[P2:%e..]]), [[V2:%e..]]
+; NORMAL-NEXT: leal ([[V1]],[[V2]]), [[ADD:%e..]]
+; NORMAL-NEXT: pushl [[ADD]]
+; NORMAL-NEXT: pushl ([[P3:%e..]])
+; NORMAL-NEXT: pushl [[V2]]
+; NORMAL-NEXT: pushl [[V1]]
+; NORMAL-NEXT: calll _good
+; NORMAL: movl [[P3]], %eax
+define i32* @test13(i32* inreg %ptr1, i32* inreg %ptr2, i32* inreg %ptr3) optsize {
+entry:
+  %val1 = load i32, i32* %ptr1
+  %val2 = load i32, i32* %ptr2
+  %val3 = load i32, i32* %ptr3
+  %add = add i32 %val1, %val2
+  call void @good(i32 %val1, i32 %val2, i32 %val3, i32 %add)
+  ret i32* %ptr3
+}
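
Editor's note: the rule the two passes now share can be modeled in a few
lines. The sketch below is a self-contained toy, not LLVM API: the Instr
struct and survivingCandidates helper are hypothetical stand-ins for
MachineInstr and the candidate-tracking loop in PeepholeOptimizer, used only
to show how a barrier invalidates pending load-fold candidates.

  #include <cstdio>
  #include <vector>

  // Hypothetical stand-in for llvm::MachineInstr; field names are
  // illustrative, not the real API.
  struct Instr {
    bool MayStore = false;
    bool IsCall = false;
    bool HasUnmodeledSideEffects = false;
    bool IsFoldableLoad = false;

    // Mirrors MachineInstr::isLoadFoldBarrier() from the patch.
    bool isLoadFoldBarrier() const {
      return MayStore || IsCall || HasUnmodeledSideEffects;
    }
  };

  // Models the PeepholeOptimizer scan: walk the block in order, remember
  // foldable loads, and drop all pending candidates once an instruction
  // makes it illegal to move a load past it.
  static size_t survivingCandidates(const std::vector<Instr> &Block) {
    std::vector<const Instr *> Candidates;
    for (const Instr &I : Block) {
      // Folding a candidate into a later user would move the load across
      // this instruction, so a store, call, or unmodeled side effect
      // invalidates everything collected so far.
      if (I.isLoadFoldBarrier())
        Candidates.clear();
      if (I.IsFoldableLoad)
        Candidates.push_back(&I);
    }
    return Candidates.size();
  }

  int main() {
    // load, store (a barrier), load: only the second load survives.
    std::vector<Instr> Block(3);
    Block[0].IsFoldableLoad = true;
    Block[1].MayStore = true;
    Block[2].IsFoldableLoad = true;
    std::printf("surviving candidates: %zu\n", survivingCandidates(Block));
    return 0;
  }

The same predicate is what lets X86CallFrameOptimization accept any
barrier-free sequence between the load and the push, instead of insisting on
a run of MOV32rm instructions as the old code did: a store, a call, or an
instruction with unmodeled side effects could change the loaded value or its
ordering, and those are exactly the cases that must block the fold.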