Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1623,7 +1623,7 @@
   /// instruction that defines FoldAsLoadDefReg, and the function returns
   /// the machine instruction generated due to folding.
   virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
-                                          const MachineRegisterInfo *MRI,
+                                          MachineRegisterInfo *MRI,
                                           Register &FoldAsLoadDefReg,
                                           MachineInstr *&DefMI) const {
     return nullptr;
Index: llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -1672,6 +1672,18 @@
     }
     break;
   }
+
+  case ISD::FADD: {
+    // Defer reg/mem folding when reassociation is allowed. Use a pseudo
+    // that clobbers CC during isel to help later load folding into ADB.
+    if (Node->getValueType(0) == MVT::f64 &&
+        Node->getFlags().hasAllowReassociation() &&
+        Node->getFlags().hasNoSignedZeros()) {
+      CurDAG->SelectNodeTo(Node, SystemZ::WFADB_CCPseudo, MVT::f64,
+                           Node->getOperand(0), Node->getOperand(1));
+      return;
+    }
+  }
   }

   SelectCode(Node);
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -9040,6 +9040,11 @@
   case TargetOpcode::PATCHPOINT:
     return emitPatchPoint(MI, MBB);

+  case SystemZ::WFADB_CCPseudo:
+    MI.setDesc(Subtarget.getInstrInfo()->get(SystemZ::WFADB));
+    MI.removeOperand(3); // CC
+    return MBB;
+
   default:
     llvm_unreachable("Unexpected instr type to insert");
   }
Index: llvm/lib/Target/SystemZ/SystemZInstrFormats.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5388,3 +5388,13 @@
                  [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
                                             GR32:$char))]>;
 }
+
+multiclass BinaryVRRcAndCCPseudo<string mnemonic, bits<16> opcode,
+                                 SDPatternOperator operator, TypedReg tr1,
+                                 TypedReg tr2, bits<4> type = 0, bits<4> m5 = 0,
+                                 bits<4> m6 = 0, string fp_mnemonic = ""> {
+  def "" : BinaryVRRc<mnemonic, opcode, operator, tr1, tr2, type, m5, m6,
+                      fp_mnemonic>;
+  let Defs = [CC], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
+    def _CCPseudo : Pseudo<(outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3), []>;
+}
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -243,6 +243,10 @@
                     const DebugLoc &DL, Register DstReg,
                     ArrayRef<MachineOperand> Cond, Register TrueReg,
                     Register FalseReg) const override;
+  MachineInstr *optimizeLoadInstr(MachineInstr &MI,
+                                  MachineRegisterInfo *MRI,
+                                  Register &FoldAsLoadDefReg,
+                                  MachineInstr *&DefMI) const override;
   bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
                      MachineRegisterInfo *MRI) const override;
   bool isPredicable(const MachineInstr &MI) const override;
@@ -274,6 +278,11 @@
                              Register VReg) const override;
   MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
                                       LiveIntervals *LIS) const override;
+
+  bool useMachineCombiner() const override { return true; }
+  bool isAssociativeAndCommutative(const MachineInstr &Inst,
+                                   bool Invert) const override;
+
   MachineInstr *
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -610,6 +610,79 @@
     .addImm(CCValid).addImm(CCMask);
 }

+static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
+  if (OldMI->registerDefIsDead(SystemZ::CC)) {
+    MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
+    if (CCDef != nullptr)
+      CCDef->setIsDead(true);
+  }
+}
+
+static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
+                           MachineInstr::MIFlag Flag) {
+  if (OldMI->getFlag(Flag))
+    NewMI->setFlag(Flag);
+}
+
+MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
+                                                  MachineRegisterInfo *MRI,
+                                                  Register &FoldAsLoadDefReg,
+                                                  MachineInstr *&DefMI) const {
+  // Check whether we can move DefMI here.
+  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+  assert(DefMI);
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(nullptr, SawStore))
+    return nullptr;
+
+  // For reassociable FP additions, loads have deliberately been left
+  // unfolded so that MachineCombiner can do its work on reg/reg opcodes.
+  // Once that has been done, as many loads as possible are folded here
+  // into reg/mem instructions.
+  if (MI.getOpcode() == SystemZ::WFADB && DefMI->getOpcode() == SystemZ::VL64 &&
+      MRI->hasOneNonDBGUse(FoldAsLoadDefReg)) {
+    MachineBasicBlock *MBB = MI.getParent();
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand LHS = MI.getOperand(1);
+    MachineOperand RHS = MI.getOperand(2);
+    MachineOperand &SrcMO = LHS.getReg() == FoldAsLoadDefReg ? RHS : LHS;
+    // Only use the two-address ADB if there is no other use of SrcMO in MBB.
+    for (auto &UseMI : MRI->use_nodbg_instructions(SrcMO.getReg()))
+      if (UseMI.getParent() == MBB && &UseMI != &MI)
+        return nullptr;
+
+    // Make sure CC is not live at this point, since ADB clobbers it.
+    MachineBasicBlock::iterator I = std::next(MI.getIterator());
+    for (; I != MBB->end(); ++I) {
+      if (I->readsRegister(SystemZ::CC))
+        return nullptr;
+      if (I->modifiesRegister(SystemZ::CC))
+        break;
+    }
+    if (I == MBB->end()) {
+      LivePhysRegs LiveRegs(RI);
+      LiveRegs.addLiveOuts(*MBB);
+      if (LiveRegs.contains(SystemZ::CC))
+        return nullptr;
+    }
+
+    MachineInstrBuilder MIB =
+        BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(SystemZ::ADB), DstReg)
+            .add(SrcMO)
+            .add(DefMI->getOperand(1))
+            .add(DefMI->getOperand(2))
+            .add(DefMI->getOperand(3))
+            .addMemOperand(*DefMI->memoperands_begin());
+    transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
+    MRI->setRegClass(SrcMO.getReg(), &SystemZ::FP64BitRegClass);
+    MRI->setRegClass(DstReg, &SystemZ::FP64BitRegClass);
+    MIB->getOperand(5).setIsDead(); // CC implicit def
+    return MIB;
+  }
+
+  return nullptr;
+}
+
 bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                      Register Reg,
                                      MachineRegisterInfo *MRI) const {
@@ -937,20 +1010,6 @@
   }
 }

-static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
-  if (OldMI->registerDefIsDead(SystemZ::CC)) {
-    MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
-    if (CCDef != nullptr)
-      CCDef->setIsDead(true);
-  }
-}
-
-static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
-                           MachineInstr::MIFlag Flag) {
-  if (OldMI->getFlag(Flag))
-    NewMI->setFlag(Flag);
-}
-
 MachineInstr *
 SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
                                         LiveIntervals *LIS) const {
@@ -1003,6 +1062,22 @@
   return nullptr;
 }

+bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
+                                                   bool Invert) const {
+  if (Invert)
+    return false; // TODO?
+
+  switch (Inst.getOpcode()) {
+  default: break;
+  // TODO: Other opcodes.
+  case SystemZ::WFADB:
+    return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+           Inst.getFlag(MachineInstr::MIFlag::FmNsz);
+  }
+
+  return false;
+}
+
 MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, int FrameIndex,
Index: llvm/lib/Target/SystemZ/SystemZInstrVector.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -139,7 +139,7 @@
   // LEY and LDY offer full 20-bit displacement fields. It's often better
   // to use those instructions rather than force a 20-bit displacement
   // into a GPR temporary.
-  let mayLoad = 1 in {
+  let mayLoad = 1, canFoldAsLoad = 1 in {
     def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>;
     def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
   }
@@ -1047,8 +1047,8 @@
 let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
   def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
   def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
-  def WFADB : BinaryVRRc<"wfadb", 0xE7E3, any_fadd, v64db, v64db, 3, 8, 0,
-                         "adbr">;
+  defm WFADB : BinaryVRRcAndCCPseudo<"wfadb", 0xE7E3, any_fadd, v64db, v64db,
+                                     3, 8, 0, "adbr">;
   let Predicates = [FeatureVectorEnhancements1] in {
     def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
     def WFASB : BinaryVRRc<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb, 2, 8, 0,
Index: llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -30,6 +30,11 @@

 using namespace llvm;

+static cl::opt<bool>
+EnableMachineCombinerPass("systemz-machine-combiner",
+                          cl::desc("Enable the machine combiner pass"),
+                          cl::init(true), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
   // Register the target.
   RegisterTargetMachine<SystemZTargetMachine> X(getTheSystemZTarget());
@@ -240,6 +245,10 @@
 bool SystemZPassConfig::addILPOpts() {
   addPass(&EarlyIfConverterID);
+
+  if (EnableMachineCombinerPass)
+    addPass(&MachineCombinerID);
+
   return true;
 }
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -541,7 +541,7 @@
   /// instruction that defines FoldAsLoadDefReg, and the function returns
   /// the machine instruction generated due to folding.
   MachineInstr *optimizeLoadInstr(MachineInstr &MI,
-                                  const MachineRegisterInfo *MRI,
+                                  MachineRegisterInfo *MRI,
                                   Register &FoldAsLoadDefReg,
                                   MachineInstr *&DefMI) const override;
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4678,7 +4678,7 @@
 /// register, the virtual register is used once in the same BB, and the
 /// instructions in-between do not load or store, and have no side effects.
 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
-                                              const MachineRegisterInfo *MRI,
+                                              MachineRegisterInfo *MRI,
                                               Register &FoldAsLoadDefReg,
                                               MachineInstr *&DefMI) const {
   // Check whether we can move DefMI here.
Index: llvm/test/CodeGen/SystemZ/fp-add-reassoc-01.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/fp-add-reassoc-01.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs | FileCheck %s
+
+define double @fun(ptr %x) {
+; CHECK-LABEL: fun:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    ld %f0, 0(%r2)
+; CHECK-NEXT:    adb %f0, 8(%r2)
+; CHECK-NEXT:    ld %f1, 24(%r2)
+; CHECK-NEXT:    adb %f1, 16(%r2)
+; CHECK-NEXT:    adbr %f0, %f1
+; CHECK-NEXT:    ld %f1, 40(%r2)
+; CHECK-NEXT:    adb %f1, 32(%r2)
+; CHECK-NEXT:    adb %f1, 48(%r2)
+; CHECK-NEXT:    adbr %f0, %f1
+; CHECK-NEXT:    adb %f0, 56(%r2)
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn double %1, %0
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %2 = load double, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %3 = load double, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %4 = load double, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %5 = load double, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %6 = load double, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+  %7 = load double, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
+  ret double %add13
+}