Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -79,7 +79,8 @@
   FMLSv4f32_OP1,
   FMLSv4f32_OP2,
   FMLSv4i32_indexed_OP1,
-  FMLSv4i32_indexed_OP2
+  FMLSv4i32_indexed_OP2,
+  SHLD2SHIFTS
 };

 } // end namespace llvm
Index: lib/CodeGen/TargetSchedule.cpp
===================================================================
--- lib/CodeGen/TargetSchedule.cpp
+++ lib/CodeGen/TargetSchedule.cpp
@@ -188,7 +188,8 @@
     const MachineInstr *DefMI, unsigned DefOperIdx,
     const MachineInstr *UseMI, unsigned UseOperIdx) const {
-  if (!hasInstrSchedModel() && !hasInstrItineraries())
+  if ((!hasInstrSchedModel() && !hasInstrItineraries()) ||
+      ((int)DefOperIdx < 0 || (((int)UseOperIdx < 0) && UseMI)))
     return TII->defaultDefLatency(SchedModel, *DefMI);

   if (hasInstrItineraries()) {
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -34402,7 +34402,9 @@
   // series of shifts/or that would otherwise be generated.
   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
   // have higher latencies and we are not optimizing for size.
-  if (!OptForSize && Subtarget.isSHLDSlow())
+  // TODO: On 64-bit targets we still emit SHLD here and let the machine
+  // combiner split it back into shifts when that is profitable.
+  if (!OptForSize && Subtarget.isSHLDSlow() &&
+      !(Subtarget.is64Bit() && Subtarget.getInstrInfo()->isMCShldEnabled()))
     return SDValue();

   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -532,7 +532,23 @@
                       const MachineInstr &UseMI,
                       unsigned UseIdx) const override;

-  bool useMachineCombiner() const override { return true; }
+  bool useMachineCombiner() const override;
+  bool isMCShldEnabled() const;
+
+  /// When getMachineCombinerPatterns() finds patterns, this function generates
+  /// the instructions that could replace the original code sequence.
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in \p Root. All potential patterns are
+  /// listed in the \p Patterns array.
+  bool getMachineCombinerPatterns(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern> &Patterns) const override;

   bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -70,6 +70,9 @@
     cl::desc("How many idle instructions we would like before "
              "certain undef register reads"),
     cl::init(128), cl::Hidden);
+static cl::opt<bool>
+    MCShldEnabled("mc-shld-enabled", cl::Hidden, cl::init(false),
+                  cl::desc("Enable Machine Combiner SHLD substitution"));

 enum {
   // Select which memory operand is being unfolded.
@@ -11309,3 +11312,319 @@
   return It;
 }
+
+bool X86InstrInfo::isMCShldEnabled() const {
+  return MCShldEnabled && Subtarget.getSchedModel().hasInstrSchedModel();
+}
+
+static void genShldCl(MachineInstr &Root,
+                      SmallVectorImpl<MachineInstr *> &InsInstrs,
+                      SmallVectorImpl<MachineInstr *> &DelInstrs,
+                      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                      ArrayRef<unsigned> Instrs) {
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  unsigned ResultReg = Root.getOperand(0).getReg();
+  unsigned DestReg = Root.getOperand(1).getReg();
+  unsigned SrcReg = Root.getOperand(2).getReg();
+  unsigned CLReg = Root.getOperand(4).getReg();
+  assert(CLReg == X86::CL && "Shift amount must be in CL!");
+
+  bool isDead = Root.getOperand(0).isDead();
+  bool isKill = Root.getOperand(1).isKill();
+
+  unsigned VShlReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(VShlReg, 0));
+  MachineInstrBuilder ShlMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), VShlReg)
+          .addReg(DestReg, getKillRegState(isKill));
+  InsInstrs.push_back(ShlMI);
+
+  unsigned MovReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(MovReg, 0));
+  MachineInstrBuilder MovMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]))
+          .addReg(MovReg, RegState::Define)
+          .addImm(64);
+  InsInstrs.push_back(MovMI);
+
+  unsigned SubReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(SubReg, 0));
+  MachineInstrBuilder SubMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]))
+          .addReg(SubReg, RegState::Define)
+          .addReg(MovReg, RegState::Kill)
+          .addReg(CLReg, RegState::Kill);
+  InsInstrs.push_back(SubMI);
+
+  MachineInstrBuilder MovMI2 =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[3]))
+          .addReg(X86::RCX, RegState::Define)
+          .addReg(SubReg);
+  InsInstrs.push_back(MovMI2);
+
+  unsigned VShrReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(VShrReg, 0));
+  MachineInstrBuilder ShrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[4]), VShrReg)
+          .addReg(SrcReg, RegState::Kill);
+  InsInstrs.push_back(ShrMI);
+
+  MachineInstrBuilder OrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[5]))
+          .addReg(ResultReg, RegState::Define | getDeadRegState(isDead))
+          .addReg(VShrReg, RegState::Kill)
+          .addReg(VShlReg, RegState::Kill);
+  InsInstrs.push_back(OrMI);
+}
+
+static void genShldMri(MachineInstr &Root,
+                       SmallVectorImpl<MachineInstr *> &InsInstrs,
+                       SmallVectorImpl<MachineInstr *> &DelInstrs,
+                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                       ArrayRef<unsigned> Instrs, int8_t NBits) {
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  unsigned BaseReg = Root.getOperand(0).getReg();
+  // TODO: At the moment we support the PC-relative addressing mode only.
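+  // A sketch of the rewrite this helper emits for the 64-bit memory form
+  // (the concrete opcodes come from the Instrs array; the shift amounts are
+  // illustrative, taken from the lshift_mem test below):
+  //   shldq $10, %rdi, x(%rip)
+  // =>
+  //   shlq $10, x(%rip)
+  //   shrq $54, %rdi
+  //   orq  %rdi, x(%rip)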
+  assert((BaseReg == X86::RIP || BaseReg == X86::EIP || BaseReg == X86::IP) &&
+         "Only PC-relative addressing is supported");
+  unsigned Scale = Root.getOperand(1).getImm();
+  unsigned IndexReg = Root.getOperand(2).getReg();
+  MachineOperand MO = Root.getOperand(3);
+
+  // TODO: We should support all possible kinds of displacement.
+  const GlobalValue *GlobalDisp = nullptr;
+  if (MO.isGlobal())
+    GlobalDisp = MO.getGlobal();
+  else
+    return; // TODO: Add the other possible memory operands.
+
+  unsigned SegReg = Root.getOperand(4).getReg();
+  unsigned SrcReg = Root.getOperand(5).getReg();
+  unsigned Shift = Root.getOperand(6).getImm();
+
+  MachineInstrBuilder ShlMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]))
+          .addReg(BaseReg)
+          .addImm(Scale)
+          .addReg(IndexReg)
+          .addGlobalAddress(GlobalDisp)
+          .addReg(SegReg)
+          .addImm(Shift);
+  InsInstrs.push_back(ShlMI);
+
+  unsigned VShrReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(VShrReg, 0));
+  MachineInstrBuilder ShrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), VShrReg)
+          .addReg(SrcReg)
+          .addImm(NBits - Shift);
+  InsInstrs.push_back(ShrMI);
+
+  MachineInstrBuilder OrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]))
+          .addReg(BaseReg)
+          .addImm(Scale)
+          .addReg(IndexReg)
+          .addGlobalAddress(GlobalDisp)
+          .addReg(SegReg)
+          .addReg(VShrReg);
+  InsInstrs.push_back(OrMI);
+}
+
+// We try to replace
+//   shldq $12, %rsi, %rdi    # sched: [3:3.00]
+// with
+//   shlq $12, %rdi           # sched: [1:0.50]
+//   shrq $52, %rsi           # sched: [1:0.50]
+//   leaq (%rsi,%rdi), %rax   # sched: [1:0.50]
+// iff the latter sequence is faster.
+static void genShldRri(MachineInstr &Root,
+                       SmallVectorImpl<MachineInstr *> &InsInstrs,
+                       SmallVectorImpl<MachineInstr *> &DelInstrs,
+                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                       ArrayRef<unsigned> Instrs, int8_t NBits) {
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  unsigned ResultReg = Root.getOperand(0).getReg();
+  unsigned DestReg = Root.getOperand(1).getReg();
+  unsigned SrcReg = Root.getOperand(2).getReg();
+
+  // FIXME: Can the shift amount be zero here?
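+  // When the shift amount is small enough to be encoded as an LEA scale,
+  // the whole pattern folds into two instructions. A sketch for a shift
+  // of 2 (cf. the "lea (x,y,4),z" expectation in the rshift10 test below):
+  //   shldq $2, %rsi, %rdi
+  // =>
+  //   shrq $62, %rsi
+  //   leaq (%rsi,%rdi,4), %rax
+  // Other shift amounts need an explicit SHL to feed the LEA.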
+  if (int64_t Imm8 = Root.getOperand(3).getImm()) {
+    // The left shift can be folded into the LEA scale factor when the
+    // shift amount is equal to 1, 2 or 3.
+    unsigned VShlReg = DestReg;
+    unsigned Scale = 1;
+    switch (Imm8) {
+    case 1:
+      Scale = 2;
+      break;
+    case 2:
+      Scale = 4;
+      break;
+    case 3:
+      Scale = 8;
+      break;
+    default:
+      VShlReg = MRI.createVirtualRegister(RC);
+      InstrIdxForVirtReg.insert(std::make_pair(VShlReg, 0));
+      MachineInstrBuilder ShlMI =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), VShlReg)
+              .addReg(DestReg)
+              .addImm(Imm8);
+      InsInstrs.push_back(ShlMI);
+    }
+
+    unsigned VShrReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(VShrReg, 0));
+    MachineInstrBuilder ShrMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), VShrReg)
+            .addReg(SrcReg)
+            .addImm(NBits - Imm8);
+    InsInstrs.push_back(ShrMI);
+
+    MachineInstrBuilder LeaMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]), ResultReg)
+            .addReg(VShrReg)
+            .addImm(Scale)
+            .addReg(VShlReg)
+            .addImm(0)  // Disp
+            .addReg(0); // SegReg
+    InsInstrs.push_back(LeaMI);
+  }
+}
+
+/// When getMachineCombinerPatterns() finds potential patterns,
+/// this function generates the instructions that could replace the
+/// original code sequence.
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+
+  switch (Pattern) {
+  default:
+    // Reassociate instructions.
+    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+                                                DelInstrs, InstrIdxForVirtReg);
+    return;
+  case MachineCombinerPattern::SHLD2SHIFTS:
+    switch (Root.getOpcode()) {
+    default:
+      return;
+    // TODO: Do we need sequences for SHLD16mrCL, SHLD32mrCL, SHLD64mrCL?
+    case X86::SHLD16mri8:
+      genShldMri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL16mi, X86::SHR16ri, X86::OR16mr}, 16);
+      break;
+    case X86::SHLD16rrCL:
+      genShldCl(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                {X86::SHL16rCL, X86::MOV16ri, X86::SUB16rr, X86::MOV16ri,
+                 X86::SHR16rCL, X86::OR16rr});
+      break;
+    case X86::SHLD16rri8:
+      genShldRri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL16ri, X86::SHR16ri, X86::LEA16r}, 16);
+      break;
+    case X86::SHLD32mri8:
+      genShldMri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL32mi, X86::SHR32ri, X86::OR32mr}, 32);
+      break;
+    case X86::SHLD32rrCL:
+      genShldCl(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                {X86::SHL32rCL, X86::MOV32ri, X86::SUB32rr, X86::MOV32ri,
+                 X86::SHR32rCL, X86::OR32rr});
+      break;
+    case X86::SHLD32rri8:
+      genShldRri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL32ri, X86::SHR32ri, X86::LEA32r}, 32);
+      break;
+    case X86::SHLD64mri8:
+      genShldMri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL64mi, X86::SHR64ri, X86::OR64mr}, 64);
+      break;
+    case X86::SHLD64rrCL:
+      genShldCl(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                {X86::SHL64rCL, X86::MOV64ri, X86::SUB64rr, X86::MOV64ri,
+                 X86::SHR64rCL, X86::OR64rr});
+      break;
+    case X86::SHLD64rri8:
+      genShldRri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL64ri, X86::SHR64ri, X86::LEA64r}, 64);
+      break;
+    }
+    DelInstrs.push_back(&Root); // Record SHLD/SHRD for deletion.
+    break;
+  }
+}
+
+/// Find SHLD/SHRD instructions.
+static bool getSHLDPatterns(MachineInstr &Root,
+                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  switch (Root.getOpcode()) {
+  default:
+    return false;
+  // TODO: Do we need sequences for SHLD(XX)mrCL?
+  // It seems they are too long:
+  //   movq x(%rip), %rax   # sched: [5:1.00]
+  //   movl %esi, %ecx      # sched: [1:0.50]
+  //   shlq %cl, %rax       # sched: [1:0.50]
+  //   movl $64, %ecx       # sched: [1:0.50]
+  //   subl %esi, %ecx      # sched: [1:0.50]
+  //   shrq %cl, %rdi       # sched: [1:0.50]
+  //   orq %rax, %rdi       # sched: [1:0.50]
+  //   movq %rdi, x(%rip)   # sched: [1:1.00]
+  // TODO: At the moment we support 64-bit only.
+  // case X86::SHLD16mri8:
+  // case X86::SHLD16rrCL:
+  // case X86::SHLD16rri8:
+  // case X86::SHLD32mri8:
+  // case X86::SHLD32rrCL:
+  // case X86::SHLD32rri8:
+  case X86::SHLD64mri8:
+  case X86::SHLD64rrCL:
+  case X86::SHLD64rri8:
+    break;
+  }
+  Patterns.push_back(MachineCombinerPattern::SHLD2SHIFTS);
+  return true;
+}
+
+/// Return true when there is potentially a faster code sequence for an
+/// instruction chain ending in \p Root. All potential patterns are listed in
+/// the \p Patterns vector. Patterns should be sorted in priority order since
+/// the pattern evaluator stops checking as soon as it finds a faster sequence.
+bool X86InstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // SHLD patterns
+  if (isMCShldEnabled() && getSHLDPatterns(Root, Patterns))
+    return true;
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
+
+bool X86InstrInfo::useMachineCombiner() const {
+  return true;
+}
Index: test/CodeGen/X86/schedule-x86-64-shld-debug.ll
===================================================================
--- test/CodeGen/X86/schedule-x86-64-shld-debug.ll
+++ test/CodeGen/X86/schedule-x86-64-shld-debug.ll
@@ -0,0 +1,351 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mc-shld-enabled -mcpu=x86-64 -machine-combiner-dump-subst-intrs -debug-only=machine-combiner 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mc-shld-enabled -mcpu=btver2 -machine-combiner-dump-subst-intrs -debug-only=machine-combiner 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+
+; uint64_t lshift10(uint64_t a, uint64_t b)
+; {
+;   return (a << 10) | (b >> 54);
+; }
+
+define i64 @lshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
+; CHECK-LABEL: lshift10_optsize:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shldq $10, %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+  %shl = shl i64 %a, 10
+  %shr = lshr i64 %b, 54
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+define i64 @lshift10(i64 %a, i64 %b) nounwind readnone {
+; CHECK-LABEL: lshift10:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shldq $10, %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+  %shl = shl i64 %a, 10
+  %shr = lshr i64 %b, 54
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+; uint64_t rshift10(uint64_t a, uint64_t b)
+; {
+;   return (a >> 62) | (b << 2);
+; }
+
+; Should be done via shld
+define i64 @rshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
+; CHECK-LABEL: rshift10_optsize:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shrdq $62, %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+  %shl = lshr i64 %a, 62
+  %shr = shl i64 %b, 2
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+; Should be done via lea (x,y,4),z
+define i64 @rshift10(i64 %a, i64 %b) nounwind readnone {
+; CHECK-LABEL: rshift10:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shrdq $62, %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+  %shl = lshr i64 %a, 62
+  %shr = shl i64 %b, 2
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+;uint64_t lshift(uint64_t a, uint64_t b, uint64_t c)
+;{
+;  return (a << c) | (b >> (64-c));
+;}
+
+define i64 @lshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize {
+; CHECK-LABEL: lshift_cl_optsize:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: shldq %cl, %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+  %shl = shl i64 %a, %c
+  %sub = sub nsw i64 64, %c
+  %shr = lshr i64 %b, %sub
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
+; CHECK-LABEL: lshift_cl:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: shldq %cl, %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+  %shl = shl i64 %a, %c
+  %sub = sub nsw i64 64, %c
+  %shr = lshr i64 %b, %sub
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+
+;uint64_t rshift(uint64_t a, uint64_t b, int c)
+;{
+;  return (a >> c) | (b << (64-c));
+;}
+
+define i64 @rshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize {
+; CHECK-LABEL: rshift_cl_optsize:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: shrdq %cl, %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+  %shr = lshr i64 %a, %c
+  %sub = sub nsw i64 64, %c
+  %shl = shl i64 %b, %sub
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
+; CHECK-LABEL: rshift_cl:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: shrdq %cl, %rsi, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+  %shr = lshr i64 %a, %c
+  %sub = sub nsw i64 64, %c
+  %shl = shl i64 %b, %sub
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+; extern uint64_t x;
+;void lshift(uint64_t a, uint64_t b, uint64_t c)
+;{
+;  x = (x << c) | (a >> (64-c));
+;}
+@x = global i64 0, align 4
+
+define void @lshift_mem_cl_optsize(i64 %a, i64 %c) nounwind readnone optsize {
+; CHECK-LABEL: lshift_mem_cl_optsize:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: shldq %cl, %rdi, {{.*}}(%rip)
+; CHECK-NEXT: retq
+entry:
+  %b = load i64, i64* @x
+  %shl = shl i64 %b, %c
+  %sub = sub nsw i64 64, %c
+  %shr = lshr i64 %a, %sub
+  %or = or i64 %shl, %shr
+  store i64 %or, i64* @x
+  ret void
+}
+
+define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
+; CHECK-LABEL: lshift_mem_cl:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: shldq %cl, %rdi, {{.*}}(%rip)
+; CHECK-NEXT: retq
+entry:
+  %b = load i64, i64* @x
+  %shl = shl i64 %b, %c
+  %sub = sub nsw i64 64, %c
+  %shr = lshr i64 %a, %sub
+  %or = or i64 %shl, %shr
+  store i64 %or, i64* @x
+  ret void
+}
+
+define void @lshift_mem(i64 %a) nounwind readnone {
+; GENERIC-LABEL: lshift_mem:
+; GENERIC: # %bb.0: # %enMachine InstCombiner: lshift_mem_optsize
+; GENERIC-NEXT: Combining MBB entry
+; GENERIC-NEXT: For the Pattern (56) these instructions could be removed
+; GENERIC-NEXT: sched: [8:1.00]: SHLD64mri8 $rip, 1, $noreg, @x, $noreg, %0, 10, implicit-def dead $eflags :: (store 8 into @x), (dereferenceable load 8 from @x)
+; GENERIC-NEXT: These instructions could replace the removed ones
+; GENERIC-NEXT: sched: [7:1.00]: SHL64mi $physreg50, 1, $noreg, @x, $noreg, 10, implicit-def $physreg25
+; GENERIC-NEXT: sched: [1:0.50]: %1 = SHR64ri %0, 54, implicit-def $physreg25
+; GENERIC-NEXT: sched: [7:1.00]: OR64mr $physreg50, 1,
$noreg, @x, $noreg, %1, implicit-def $physreg25 +; GENERIC-NEXT: Dependence data for SHLD64mri8 $rip, 1, $noreg, @x, $noreg, %0:gr64, 10, implicit-def dead $eflags :: (store 8 into @x), (dereferenceable load 8 from @x) +; GENERIC-NEXT: NewRootDepth: 4 RootDepth: 0 +; GENERIC-NEXT: NewRootLatency: 8 RootLatency: 8 +; GENERIC-NEXT: RootSlack: 0 SlackIsAccurate=1 +; GENERIC-NEXT: NewRootDepth + NewRootLatency = 12 +; GENERIC-NEXT: RootDepth + RootLatency + RootSlack = 8 +; GENERIC-NEXT: It DOES NOT improve PathLen because +; GENERIC-NEXT: NewCycleCount = 12, OldCycleCount = 8 +; GENERIC-NEXT: Machine InstCombiner: lshift_mem_b +; GENERIC-NEXT: Combining MBB entry +; GENERIC-NEXT: For the Pattern (56) these instructions could be removed +; GENERIC-NEXT: sched: [2:0.67]: %2:gr64 = SHLD64rri8 %0, killed %1, 10, implicit-def dead $eflags +; GENERIC-NEXT: These instructions could replace the removed ones +; GENERIC-NEXT: sched: [1:0.50]: %3 = SHL64ri %0, 10, implicit-def $physreg25 +; GENERIC-NEXT: sched: [1:0.50]: %4 = SHR64ri %1, 54, implicit-def $physreg25 +; GENERIC-NEXT: sched: [1:0.50]: %2 = LEA64r %4, 1, %3, 0, $noreg +; GENERIC-NEXT: Dependence data for %2:gr64 = SHLD64rri8 %0:gr64, killed %1:gr64, 10, implicit-def dead $eflags +; GENERIC-NEXT: NewRootDepth: 1 RootDepth: 5 +; GENERIC-NEXT: NewRootLatency: 3 RootLatency: 2 +; GENERIC-NEXT: RootSlack: 0 SlackIsAccurate=1 +; GENERIC-NEXT: NewRootDepth + NewRootLatency = 4 +; GENERIC-NEXT: RootDepth + RootLatency + RootSlack = 7 +; GENERIC-NEXT: It IMPROVES PathLen because +; GENERIC-NEXT: NewCycleCount = 4, OldCycleCount = 7 +; GENERIC-NEXT: Resource length before replacement: 3 and after: 3 +; GENERIC-NEXT: As result it IMPROVES/PRESERVES Resource Length +; GENERIC-NEXT: Machine InstCombiner: lshift_mem_b_optsize +; GENERIC-NEXT: Combining MBB entry +; GENERIC-NEXT: For the Pattern (56) these instructions could be removed +; GENERIC-NEXT: sched: [2:0.67]: %2:gr64 = SHLD64rri8 %0, killed %1, 10, implicit-def dead $eflags +; GENERIC-NEXT: These instructions could replace the removed ones +; GENERIC-NEXT: sched: [1:0.50]: %3 = SHL64ri %0, 10, implicit-def $physreg25 +; GENERIC-NEXT: sched: [1:0.50]: %4 = SHR64ri %1, 54, implicit-def $physreg25 +; GENERIC-NEXT: sched: [1:0.50]: %2 = LEA64r %4, 1, %3, 0, $noreg +; GENERIC-NEXT: Dependence data for %2:gr64 = SHLD64rri8 %0:gr64, killed %1:gr64, 10, implicit-def dead $eflags +; GENERIC-NEXT: NewRootDepth: 1 RootDepth: 5 +; GENERIC-NEXT: NewRootLatency: 3 RootLatency: 2 +; GENERIC-NEXT: RootSlack: 0 SlackIsAccurate=1 +; GENERIC-NEXT: NewRootDepth + NewRootLatency = 4 +; GENERIC-NEXT: RootDepth + RootLatency + RootSlack = 7 +; GENERIC-NEXT: It IMPROVES PathLen because +; GENERIC-NEXT: NewCycleCount = 4, OldCycleCount = 7 +; GENERIC-NEXT: Resource length before replacement: 3 and after: 3 +; GENERIC-NEXT: As result it IMPROVES/PRESERVES Resource Length +; GENERIC-NEXT: try +; GENERIC-NEXT: shldq $10, %rdi, {{.*}}(%rip) +; GENERIC-NEXT: retq +; +; BTVER2-LABEL: lshift_mem: +; BTVER2: # %bb.0: # %enMachine InstCombiner: lshift_mem_optsize +; BTVER2-NEXT: Combining MBB entry +; BTVER2-NEXT: For the Pattern (56) these instructions could be removed +; BTVER2-NEXT: sched: [9:11.00]: SHLD64mri8 $rip, 1, $noreg, @x, $noreg, %0, 10, implicit-def dead $eflags :: (store 8 into @x), (dereferenceable load 8 from @x) +; BTVER2-NEXT: These instructions could replace the removed ones +; BTVER2-NEXT: sched: [4:1.00]: SHL64mi $physreg50, 1, $noreg, @x, $noreg, 10, implicit-def $physreg25 +; BTVER2-NEXT: sched: [1:0.50]: 
%1 = SHR64ri %0, 54, implicit-def $physreg25 +; BTVER2-NEXT: sched: [4:1.00]: OR64mr $physreg50, 1, $noreg, @x, $noreg, %1, implicit-def $physreg25 +; BTVER2-NEXT: Dependence data for SHLD64mri8 $rip, 1, $noreg, @x, $noreg, %0:gr64, 10, implicit-def dead $eflags :: (store 8 into @x), (dereferenceable load 8 from @x) +; BTVER2-NEXT: NewRootDepth: 5 RootDepth: 0 +; BTVER2-NEXT: NewRootLatency: 5 RootLatency: 9 +; BTVER2-NEXT: RootSlack: 0 SlackIsAccurate=1 +; BTVER2-NEXT: NewRootDepth + NewRootLatency = 10 +; BTVER2-NEXT: RootDepth + RootLatency + RootSlack = 9 +; BTVER2-NEXT: It DOES NOT improve PathLen because +; BTVER2-NEXT: NewCycleCount = 10, OldCycleCount = 9 +; BTVER2-NEXT: Machine InstCombiner: lshift_mem_b +; BTVER2-NEXT: Combining MBB entry +; BTVER2-NEXT: For the Pattern (56) these instructions could be removed +; BTVER2-NEXT: sched: [3:3.00]: %2:gr64 = SHLD64rri8 %0, killed %1, 10, implicit-def dead $eflags +; BTVER2-NEXT: These instructions could replace the removed ones +; BTVER2-NEXT: sched: [1:0.50]: %3 = SHL64ri %0, 10, implicit-def $physreg25 +; BTVER2-NEXT: sched: [1:0.50]: %4 = SHR64ri %1, 54, implicit-def $physreg25 +; BTVER2-NEXT: sched: [1:0.50]: %2 = LEA64r %4, 1, %3, 0, $noreg +; BTVER2-NEXT: Dependence data for %2:gr64 = SHLD64rri8 %0:gr64, killed %1:gr64, 10, implicit-def dead $eflags +; BTVER2-NEXT: NewRootDepth: 2 RootDepth: 5 +; BTVER2-NEXT: NewRootLatency: 3 RootLatency: 3 +; BTVER2-NEXT: RootSlack: 0 SlackIsAccurate=1 +; BTVER2-NEXT: NewRootDepth + NewRootLatency = 5 +; BTVER2-NEXT: RootDepth + RootLatency + RootSlack = 8 +; BTVER2-NEXT: It IMPROVES PathLen because +; BTVER2-NEXT: NewCycleCount = 5, OldCycleCount = 8 +; BTVER2-NEXT: Resource length before replacement: 7 and after: 6 +; BTVER2-NEXT: As result it IMPROVES/PRESERVES Resource Length +; BTVER2-NEXT: Machine InstCombiner: lshift_mem_b_optsize +; BTVER2-NEXT: Combining MBB entry +; BTVER2-NEXT: For the Pattern (56) these instructions could be removed +; BTVER2-NEXT: sched: [3:3.00]: %2:gr64 = SHLD64rri8 %0, killed %1, 10, implicit-def dead $eflags +; BTVER2-NEXT: These instructions could replace the removed ones +; BTVER2-NEXT: sched: [1:0.50]: %3 = SHL64ri %0, 10, implicit-def $physreg25 +; BTVER2-NEXT: sched: [1:0.50]: %4 = SHR64ri %1, 54, implicit-def $physreg25 +; BTVER2-NEXT: sched: [1:0.50]: %2 = LEA64r %4, 1, %3, 0, $noreg +; BTVER2-NEXT: Dependence data for %2:gr64 = SHLD64rri8 %0:gr64, killed %1:gr64, 10, implicit-def dead $eflags +; BTVER2-NEXT: NewRootDepth: 2 RootDepth: 5 +; BTVER2-NEXT: NewRootLatency: 3 RootLatency: 3 +; BTVER2-NEXT: RootSlack: 0 SlackIsAccurate=1 +; BTVER2-NEXT: NewRootDepth + NewRootLatency = 5 +; BTVER2-NEXT: RootDepth + RootLatency + RootSlack = 8 +; BTVER2-NEXT: It IMPROVES PathLen because +; BTVER2-NEXT: NewCycleCount = 5, OldCycleCount = 8 +; BTVER2-NEXT: Resource length before replacement: 7 and after: 6 +; BTVER2-NEXT: As result it IMPROVES/PRESERVES Resource Length +; BTVER2-NEXT: try +; BTVER2-NEXT: shldq $10, %rdi, {{.*}}(%rip) +; BTVER2-NEXT: retq +entry: + %b = load i64, i64* @x + %shl = shl i64 %b, 10 + %shr = lshr i64 %a, 54 + %or = or i64 %shr, %shl + store i64 %or, i64* @x + ret void +} + +define void @lshift_mem_optsize(i64 %a) nounwind readnone optsize { +; CHECK-LABEL: lshift_mem_optsize: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shldq $10, %rdi, {{.*}}(%rip) +; CHECK-NEXT: retq +entry: + %b = load i64, i64* @x + %shl = shl i64 %b, 10 + %shr = lshr i64 %a, 54 + %or = or i64 %shr, %shl + store i64 %or, i64* @x + ret void +} + +define void 
@lshift_mem_b(i64 %b) nounwind readnone { +; CHECK-LABEL: lshift_mem_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: shlq $10, %rdi +; CHECK-NEXT: shrq $54, %rax +; CHECK-NEXT: leaq (%rax,%rdi), %rax +; CHECK-NEXT: movq %rax, {{.*}}(%rip) +; CHECK-NEXT: retq +entry: + %a = load i64, i64* @x + %shl = shl i64 %b, 10 + %shr = lshr i64 %a, 54 + %or = or i64 %shr, %shl + store i64 %or, i64* @x + ret void +} + +define void @lshift_mem_b_optsize(i64 %b) nounwind readnone optsize { +; CHECK-LABEL: lshift_mem_b_optsize: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq {{.*}}(%rip), %rax +; CHECK-NEXT: shlq $10, %rdi +; CHECK-NEXT: shrq $54, %rax +; CHECK-NEXT: leaq (%rax,%rdi), %rax +; CHECK-NEXT: movq %rax, {{.*}}(%rip) +; CHECK-NEXT: retq +entry: + %a = load i64, i64* @x + %shl = shl i64 %b, 10 + %shr = lshr i64 %a, 54 + %or = or i64 %shr, %shl + store i64 %or, i64* @x + ret void +} + Index: test/CodeGen/X86/schedule-x86-64-shld.ll =================================================================== --- test/CodeGen/X86/schedule-x86-64-shld.ll +++ test/CodeGen/X86/schedule-x86-64-shld.ll @@ -1,10 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mc-shld-enabled -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mc-shld-enabled -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mc-shld-enabled -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER1 - -; uint64_t lshift10(uint64_t a, uint64_t b) ; { ; return (a << 10) | (b >> 54); ; } @@ -43,9 +41,8 @@ ; ; BTVER2-LABEL: lshift10: ; BTVER2: # %bb.0: # %entry -; BTVER2-NEXT: shlq $10, %rdi # sched: [1:0.50] -; BTVER2-NEXT: shrq $54, %rsi # sched: [1:0.50] -; BTVER2-NEXT: leaq (%rsi,%rdi), %rax # sched: [1:0.50] +; BTVER2-NEXT: shldq $10, %rsi, %rdi # sched: [3:3.00] +; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; BDVER1-LABEL: lshift10: @@ -102,8 +99,8 @@ ; ; BTVER2-LABEL: rshift10: ; BTVER2: # %bb.0: # %entry -; BTVER2-NEXT: shrq $62, %rdi # sched: [1:0.50] -; BTVER2-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50] +; BTVER2-NEXT: shrdq $62, %rsi, %rdi # sched: [3:3.00] +; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; BDVER1-LABEL: rshift10: @@ -163,13 +160,8 @@ ; BTVER2-LABEL: lshift_cl: ; BTVER2: # %bb.0: # %entry ; BTVER2-NEXT: movl %edx, %ecx # sched: [1:0.50] -; BTVER2-NEXT: shlq %cl, %rdi # sched: [1:0.50] -; BTVER2-NEXT: movl $64, %ecx # sched: [1:0.50] -; BTVER2-NEXT: subl %edx, %ecx # sched: [1:0.50] -; BTVER2-NEXT: # kill: def $cl killed $cl killed $ecx -; BTVER2-NEXT: shrq %cl, %rsi # sched: [1:0.50] -; BTVER2-NEXT: orq %rdi, %rsi # sched: [1:0.50] -; BTVER2-NEXT: movq %rsi, %rax # sched: [1:0.50] +; BTVER2-NEXT: shldq %cl, %rsi, %rdi # sched: [4:4.00] +; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: 
[4:1.00] ; ; BDVER1-LABEL: lshift_cl: @@ -237,13 +229,8 @@ ; BTVER2-LABEL: rshift_cl: ; BTVER2: # %bb.0: # %entry ; BTVER2-NEXT: movl %edx, %ecx # sched: [1:0.50] -; BTVER2-NEXT: shrq %cl, %rdi # sched: [1:0.50] -; BTVER2-NEXT: movl $64, %ecx # sched: [1:0.50] -; BTVER2-NEXT: subl %edx, %ecx # sched: [1:0.50] -; BTVER2-NEXT: # kill: def $cl killed $cl killed $ecx -; BTVER2-NEXT: shlq %cl, %rsi # sched: [1:0.50] -; BTVER2-NEXT: orq %rdi, %rsi # sched: [1:0.50] -; BTVER2-NEXT: movq %rsi, %rax # sched: [1:0.50] +; BTVER2-NEXT: shrdq %cl, %rsi, %rdi # sched: [4:4.00] +; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; BDVER1-LABEL: rshift_cl: @@ -309,15 +296,8 @@ ; ; BTVER2-LABEL: lshift_mem_cl: ; BTVER2: # %bb.0: # %entry -; BTVER2-NEXT: movq {{.*}}(%rip), %rax # sched: [5:1.00] ; BTVER2-NEXT: movl %esi, %ecx # sched: [1:0.50] -; BTVER2-NEXT: shlq %cl, %rax # sched: [1:0.50] -; BTVER2-NEXT: movl $64, %ecx # sched: [1:0.50] -; BTVER2-NEXT: subl %esi, %ecx # sched: [1:0.50] -; BTVER2-NEXT: # kill: def $cl killed $cl killed $ecx -; BTVER2-NEXT: shrq %cl, %rdi # sched: [1:0.50] -; BTVER2-NEXT: orq %rax, %rdi # sched: [1:0.50] -; BTVER2-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:1.00] +; BTVER2-NEXT: shldq %cl, %rdi, {{.*}}(%rip) # sched: [9:11.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; BDVER1-LABEL: lshift_mem_cl: @@ -350,11 +330,7 @@ ; ; BTVER2-LABEL: lshift_mem: ; BTVER2: # %bb.0: # %entry -; BTVER2-NEXT: movq {{.*}}(%rip), %rax # sched: [5:1.00] -; BTVER2-NEXT: shrq $54, %rdi # sched: [1:0.50] -; BTVER2-NEXT: shlq $10, %rax # sched: [1:0.50] -; BTVER2-NEXT: orq %rax, %rdi # sched: [1:0.50] -; BTVER2-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:1.00] +; BTVER2-NEXT: shldq $10, %rdi, {{.*}}(%rip) # sched: [9:11.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; BDVER1-LABEL: lshift_mem: @@ -402,7 +378,9 @@ ; GENERIC-LABEL: lshift_mem_b: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50] -; GENERIC-NEXT: shrdq $54, %rdi, %rax # sched: [2:0.67] +; GENERIC-NEXT: shlq $10, %rdi # sched: [1:0.50] +; GENERIC-NEXT: shrq $54, %rax # sched: [1:0.50] +; GENERIC-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] ; GENERIC-NEXT: movq %rax, {{.*}}(%rip) # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -411,7 +389,7 @@ ; BTVER2-NEXT: movq {{.*}}(%rip), %rax # sched: [5:1.00] ; BTVER2-NEXT: shlq $10, %rdi # sched: [1:0.50] ; BTVER2-NEXT: shrq $54, %rax # sched: [1:0.50] -; BTVER2-NEXT: orq %rdi, %rax # sched: [1:0.50] +; BTVER2-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] ; BTVER2-NEXT: movq %rax, {{.*}}(%rip) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; @@ -423,6 +401,7 @@ ; BDVER1-NEXT: orq %rdi, %rax ; BDVER1-NEXT: movq %rax, {{.*}}(%rip) ; BDVER1-NEXT: retq + entry: %a = load i64, i64* @x %shl = shl i64 %b, 10 @@ -432,18 +411,24 @@ ret void } +; uint64_t lshift10(uint64_t a, uint64_t b) + define void @lshift_mem_b_optsize(i64 %b) nounwind readnone optsize { ; GENERIC-LABEL: lshift_mem_b_optsize: ; GENERIC: # %bb.0: # %entry ; GENERIC-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50] -; GENERIC-NEXT: shrdq $54, %rdi, %rax # sched: [2:0.67] +; GENERIC-NEXT: shlq $10, %rdi # sched: [1:0.50] +; GENERIC-NEXT: shrq $54, %rax # sched: [1:0.50] +; GENERIC-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] ; GENERIC-NEXT: movq %rax, {{.*}}(%rip) # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; BTVER2-LABEL: lshift_mem_b_optsize: ; BTVER2: # %bb.0: # %entry ; BTVER2-NEXT: movq {{.*}}(%rip), %rax # sched: [5:1.00] -; 
BTVER2-NEXT: shrdq $54, %rdi, %rax # sched: [3:3.00] +; BTVER2-NEXT: shlq $10, %rdi # sched: [1:0.50] +; BTVER2-NEXT: shrq $54, %rax # sched: [1:0.50] +; BTVER2-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] ; BTVER2-NEXT: movq %rax, {{.*}}(%rip) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ;
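
; To experiment with the combine outside the lit tests, the RUN-line flags
; above can be reused on a standalone input (a sketch; shld.ll is an arbitrary
; file name, and -debug-only requires an assertions-enabled build):
;
;   llc < shld.ll -mtriple=x86_64-unknown-unknown -mc-shld-enabled \
;       -mcpu=btver2 -debug-only=machine-combiner 2>&1
;
; with shld.ll containing the same pattern as lshift_mem_b:
;
;   @x = global i64 0, align 4
;   define void @f(i64 %b) {
;   entry:
;     %a = load i64, i64* @x
;     %shl = shl i64 %b, 10
;     %shr = lshr i64 %a, 54
;     %or = or i64 %shr, %shl
;     store i64 %or, i64* @x
;     ret void
;   }
;
; Per the BTVER2 log in schedule-x86-64-shld-debug.ll, the combiner should
; report that the shlq/shrq/leaq sequence IMPROVES PathLen (NewCycleCount = 5
; vs. OldCycleCount = 8) and replace the SHLD64rri8 accordingly.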