diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -117,6 +117,7 @@
   bool emitRLDICWhenLoweringJumpTables(MachineInstr &MI);
   void UpdateTOCSaves(std::map<MachineInstr *, bool> &TOCSaves, MachineInstr *MI);
+  bool optimizeLoad(MachineInstr &MI);
 
 public:
@@ -277,6 +278,116 @@
   TOCSaves[MI] = Keep;
 }
 
+// Try to eliminate a load whose value is already available from a preceding
+// store to the same location. For example, for a byval parameter, PPC
+// generates loads to read the argument back from the caller's stack frame;
+// if there is a store to the same stack location, this may cause a
+// load-hit-store hazard. Because the load is generated during ISel, mem2reg
+// has no chance to optimize this pattern.
+bool PPCMIPeephole::optimizeLoad(MachineInstr &MI) {
+  // For now, only enable this before register allocation.
+  if (!MRI->isSSA())
+    return false;
+
+  if (MI.isInlineAsm())
+    return false;
+
+  if (!MI.hasOneMemOperand() || !MI.memoperands()[0]->isLoad())
+    return false;
+
+  const MCInstrDesc &MCID = MI.getDesc();
+  if (MCID.getNumDefs() != 1)
+    return false;
+
+  bool SawStore = false;
+  if (!MI.isSafeToMove(nullptr, SawStore))
+    return false;
+
+  auto IsValidStore = [&](MachineInstr &StoreMI) {
+    if (StoreMI.isInlineAsm())
+      return false;
+
+    if (!StoreMI.hasOneMemOperand() || !StoreMI.memoperands()[0]->isStore())
+      return false;
+
+    unsigned StoreDefNum = StoreMI.getNumExplicitDefs();
+    unsigned StoreOpNum = StoreMI.getNumExplicitOperands();
+
+    // The store should have one more explicit use operand than the load; the
+    // extra use operand is the value stored to memory.
+    if (StoreOpNum - StoreDefNum != MI.getNumExplicitOperands())
+      return false;
+
+    if (!StoreMI.explicit_operands().begin()->isReg())
+      return false;
+
+    Register DestReg = MI.getOperand(0).getReg();
+    Register SrcReg = StoreMI.explicit_operands().begin()->getReg();
+
+    // The value stored to memory must have the same register class as the
+    // load instruction's single def.
+    if (MRI->getRegClass(DestReg) != MRI->getRegClass(SrcReg))
+      return false;
+
+    if (StoreMI.memoperands()[0]->getMemoryType().getSizeInBytes() !=
+        MI.memoperands()[0]->getMemoryType().getSizeInBytes())
+      return false;
+
+    // The extension type must also be the same.
+    if (TII->isSignOrZeroExtended(SrcReg, 0, MRI) !=
+        TII->isSignOrZeroExtended(DestReg, 0, MRI))
+      return false;
+
+    // Each operand describing the store's memory location must be identical
+    // to the corresponding load operand. Skip the first use operand, which
+    // is the value stored to memory.
+    for (unsigned UseOp = StoreDefNum + 1; UseOp < StoreOpNum; UseOp++)
+      if (!StoreMI.getOperand(UseOp).isIdenticalTo(
+              MI.getOperand(UseOp - StoreDefNum)))
+        return false;
+
+    return true;
+  };
+
+  // To reduce complexity, only look for a candidate store in the same block.
+  // This is enough to handle the motivating load-hit-store issue.
+  MachineBasicBlock::reverse_iterator E = MI.getParent()->rend(), It = MI;
+  It++;
+  MachineInstr *Store = nullptr;
+
+  for (; It != E; ++It) {
+    if (It->isLoadFoldBarrier() && !It->mayStore())
+      return false;
+
+    if (!It->mayStore())
+      continue;
+
+    if (IsValidStore(*It)) {
+      Store = &*It;
+      break;
+    }
+
+    if (MI.mayAlias(/* AA */ nullptr, *It, /* UseTBAA */ false))
+      return false;
+  }
+
+  if (!Store)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Replacing load instruction "; MI.dump());
+  LLVM_DEBUG(dbgs() << " Related Store is "; Store->dump());
+
+  auto StoreOp = Store->explicit_operands().begin();
+  if (StoreOp->isKill())
+    StoreOp->setIsKill(false);
+
+  MachineInstr *Copy = BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(),
+                               TII->get(PPC::COPY), MI.getOperand(0).getReg())
+                           .add(*StoreOp);
+  (void)Copy;
+  LLVM_DEBUG(dbgs() << "with COPY "; Copy->dump());
+
+  return true;
+}
+
 // This function returns a list of all PHI nodes in the tree starting from
 // the RootPHI node. We perform a BFS traversal to get an ordered list of nodes.
 // The list initially only contains the root PHI. When we visit a PHI node, we
@@ -440,6 +551,11 @@
       if (MI.isDebugInstr())
         continue;
 
+      if (MI.mayLoad() && optimizeLoad(MI)) {
+        ToErase = &MI;
+        continue;
+      }
+
       // Per-opcode peepholes.
       switch (MI.getOpcode()) {
diff --git a/llvm/test/CodeGen/PowerPC/byval-lhs.ll b/llvm/test/CodeGen/PowerPC/byval-lhs.ll
--- a/llvm/test/CodeGen/PowerPC/byval-lhs.ll
+++ b/llvm/test/CodeGen/PowerPC/byval-lhs.ll
@@ -17,7 +17,6 @@
 ; LE-NEXT:    stdu r1, -48(r1)
 ; LE-NEXT:    std r0, 64(r1)
 ; LE-NEXT:    std r3, 40(r1)
-; LE-NEXT:    ld r3, 40(r1)
 ; LE-NEXT:    bl f0
 ; LE-NEXT:    nop
 ; LE-NEXT:    addi r1, r1, 48
@@ -31,7 +30,6 @@
 ; AIX-NEXT:   stdu r1, -128(r1)
 ; AIX-NEXT:   std r0, 144(r1)
 ; AIX-NEXT:   std r3, 120(r1)
-; AIX-NEXT:   ld r3, 120(r1)
 ; AIX-NEXT:   bl .f0[PR]
 ; AIX-NEXT:   nop
 ; AIX-NEXT:   addi r1, r1, 128
@@ -49,11 +47,10 @@
 ; LE: # %bb.0:
 ; LE-NEXT:    mflr r0
 ; LE-NEXT:    stdu r1, -48(r1)
+; LE-NEXT:    mr r4, r3
 ; LE-NEXT:    std r0, 64(r1)
 ; LE-NEXT:    std r3, 32(r1)
 ; LE-NEXT:    std r3, 40(r1)
-; LE-NEXT:    ld r4, 40(r1)
-; LE-NEXT:    ld r3, 32(r1)
 ; LE-NEXT:    bl f1
 ; LE-NEXT:    nop
 ; LE-NEXT:    addi r1, r1, 48
@@ -65,11 +62,10 @@
 ; AIX: # %bb.0:
 ; AIX-NEXT:   mflr r0
 ; AIX-NEXT:   stdu r1, -128(r1)
+; AIX-NEXT:   mr r4, r3
 ; AIX-NEXT:   std r0, 144(r1)
 ; AIX-NEXT:   std r3, 112(r1)
 ; AIX-NEXT:   std r3, 120(r1)
-; AIX-NEXT:   ld r4, 120(r1)
-; AIX-NEXT:   ld r3, 112(r1)
 ; AIX-NEXT:   bl .f1[PR]
 ; AIX-NEXT:   nop
 ; AIX-NEXT:   addi r1, r1, 128
diff --git a/llvm/test/CodeGen/PowerPC/convert-load-to-copy.mir b/llvm/test/CodeGen/PowerPC/convert-load-to-copy.mir
--- a/llvm/test/CodeGen/PowerPC/convert-load-to-copy.mir
+++ b/llvm/test/CodeGen/PowerPC/convert-load-to-copy.mir
@@ -18,8 +18,8 @@
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY $x3
   ; CHECK-NEXT: STD [[COPY]], 0, %stack.0 :: (store (s64) into %stack.0)
-  ; CHECK-NEXT: [[LD:%[0-9]+]]:g8rc = LD 0, %stack.0 :: (load (s64) from %stack.0)
-  ; CHECK-NEXT: $x3 = COPY [[LD]]
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:g8rc = COPY [[COPY]]
+  ; CHECK-NEXT: $x3 = COPY [[COPY1]]
   ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm
   %0:g8rc = COPY $x3
   STD %0:g8rc, 0, %stack.0 :: (store (s64) into %stack.0)
@@ -137,8 +137,8 @@
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY $x3
   ; CHECK-NEXT: STDX [[COPY]], $x4, %stack.0 :: (store (s64) into %stack.0)
-  ; CHECK-NEXT: [[LDX:%[0-9]+]]:g8rc = LDX $x4, %stack.0 :: (load (s64) from %stack.0)
-  ; CHECK-NEXT: $x3 = COPY [[LDX]]
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:g8rc = COPY [[COPY]]
+  ; CHECK-NEXT: $x3 = COPY [[COPY1]]
   ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm
   %0:g8rc = COPY $x3
   STDX %0:g8rc, $x4, %stack.0 :: (store (s64) into %stack.0)
@@ -206,8 +206,8 @@
   ; CHECK-NEXT: {{ $}}
   ; CHECK-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY $x3
   ; CHECK-NEXT: STD [[COPY]], 0, %stack.0 :: (store monotonic (s64) into %stack.0)
-  ; CHECK-NEXT: [[LD:%[0-9]+]]:g8rc = LD 0, %stack.0 :: (load (s64) from %stack.0)
-  ; CHECK-NEXT: $x3 = COPY [[LD]]
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:g8rc = COPY [[COPY]]
+  ; CHECK-NEXT: $x3 = COPY [[COPY1]]
   ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm
   %0:g8rc = COPY $x3
   STD %0:g8rc, 0, %stack.0 :: (store monotonic (s64) into %stack.0)
diff --git a/llvm/test/CodeGen/PowerPC/vsx-p9.ll b/llvm/test/CodeGen/PowerPC/vsx-p9.ll
--- a/llvm/test/CodeGen/PowerPC/vsx-p9.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-p9.ll
@@ -137,8 +137,8 @@
   tail call void (...) @sink(<2 x double> %add.i12)
 ; CHECK: lxv 0, 0(3)
 ; CHECK: lxv 1, 0(3)
-; CHECK: xvadddp 0, 0, 1
-; CHECK: stxv 0,
+; CHECK: xvadddp 34, 0, 1
+; CHECK: stxv 34,
 ; CHECK: bl sink
   ret void
 }