Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -79,7 +79,8 @@
   FMLSv4f32_OP1,
   FMLSv4f32_OP2,
   FMLSv4i32_indexed_OP1,
-  FMLSv4i32_indexed_OP2
+  FMLSv4i32_indexed_OP2,
+  SHLD2SHIFTS
 };
 
 } // end namespace llvm
Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -162,8 +162,14 @@
         DepthOp = InstrDepth[II->second];
         int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
         int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
-        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
-                                                      InstrPtr, UseIdx);
+        if (DefIdx < 0 || UseIdx < 0)
+          // Without def/use operand indices we cannot compute the latency
+          // from the sched model, so fall back to the default def latency.
+          LatencyOp = TII->defaultDefLatency(SchedModel, *DefInstr);
+        else
+          LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
+                                                        InstrPtr, UseIdx);
+
       } else {
         MachineInstr *DefInstr = getOperandDef(MO);
         if (DefInstr) {
@@ -364,6 +370,8 @@
 bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
   if (OptSize && (NewSize < OldSize))
     return true;
+  if (OptSize && (NewSize > OldSize))
+    return false;
   if (!TSchedModel.hasInstrSchedModelOrItineraries())
     return true;
   return false;
@@ -498,7 +506,7 @@
         // Eagerly stop after the first pattern fires.
         Changed = true;
         break;
-      } else {
+      } else if (!(OptSize && (NewInstCount > OldInstCount))) {
         // For big basic blocks, we only compute the full trace the first time
         // we hit this. We do not invalidate the trace, but instead update the
         // instruction depths incrementally.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -33790,7 +33790,9 @@
   // series of shifts/or that would otherwise be generated.
   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
   // have higher latencies and we are not optimizing for size.
-  if (!OptForSize && Subtarget.isSHLDSlow())
+  // TODO: for 64-bit we form SHLD anyway and let the machine combiner expand it.
+  if (!OptForSize && Subtarget.isSHLDSlow() &&
+      !(Subtarget.is64Bit() && Subtarget.getInstrInfo()->isMCShldEnabled()))
     return SDValue();
 
   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -519,7 +519,23 @@
                              const MachineInstr &UseMI,
                              unsigned UseIdx) const override;
 
-  bool useMachineCombiner() const override { return true; }
+  bool useMachineCombiner() const override;
+  bool isMCShldEnabled() const;
+
+  /// When getMachineCombinerPatterns() finds patterns, this function generates
+  /// the instructions that could replace the original code sequence.
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in \p Root. All potential patterns are
+  /// listed in the \p Patterns array.
+  bool getMachineCombinerPatterns(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern> &Patterns) const override;
 
   bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -70,6 +70,9 @@
                 cl::desc("How many idle instructions we would like before "
                          "certain undef register reads"),
                 cl::init(128), cl::Hidden);
+static cl::opt<bool>
+    MCShldEnabled("mc-shld-enabled", cl::Hidden, cl::init(false),
+                  cl::desc("Enable Machine Combiner SHLD substitution"));
 
 enum {
   // Select which memory operand is being unfolded.
@@ -3800,6 +3803,8 @@
       }
     }
   }
+  MCShldEnabled =
+      MCShldEnabled && Subtarget.getSchedModel().hasInstrSchedModel();
 }
 
 void
@@ -11221,3 +11226,317 @@
   return It;
 }
+
+bool X86InstrInfo::isMCShldEnabled() const { return MCShldEnabled; }
+
+// Expand a register-register SHLD with a variable amount in CL, e.g.
+//   shldq %cl, %rsi, %rdi
+// into
+//   shlq %cl, %rdi
+//   movl $64, %ecx
+//   subl %edx, %ecx
+//   shrq %cl, %rsi
+//   orq %rdi, %rsi
+// iff the latter is faster.
+static void genShldCl(MachineInstr &Root,
+                      SmallVectorImpl<MachineInstr *> &InsInstrs,
+                      SmallVectorImpl<MachineInstr *> &DelInstrs,
+                      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                      ArrayRef<unsigned> Instrs) {
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  unsigned ResultReg = Root.getOperand(0).getReg();
+  unsigned DestReg = Root.getOperand(1).getReg();
+  unsigned SrcReg = Root.getOperand(2).getReg();
+  unsigned CLReg = Root.getOperand(4).getReg();
+  assert(CLReg == X86::CL && "Expected the shift amount in CL!");
+
+  bool isDead = Root.getOperand(0).isDead();
+  bool isKill = Root.getOperand(1).isKill();
+
+  // VShlReg = DestReg << CL
+  unsigned VShlReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(VShlReg, 0));
+  MachineInstrBuilder ShlMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), VShlReg)
+          .addReg(DestReg, getKillRegState(isKill));
+  InsInstrs.push_back(ShlMI);
+
+  // MovReg = 64. Note the hard-coded width; only the 64-bit patterns are
+  // enabled below, so this is currently correct.
+  unsigned MovReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(MovReg, 0));
+  MachineInstrBuilder MovMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]))
+          .addReg(MovReg, RegState::Define)
+          .addImm(64);
+  InsInstrs.push_back(MovMI);
+
+  // SubReg = MovReg - CL, the complementary shift amount.
+  unsigned SubReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(SubReg, 0));
+  MachineInstrBuilder SubMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]))
+          .addReg(SubReg, RegState::Define)
+          .addReg(MovReg, RegState::Kill)
+          .addReg(CLReg, RegState::Kill);
+  InsInstrs.push_back(SubMI);
+
+  // Move the complementary amount back into CL.
+  // FIXME: RCX is hard-coded; the 16/32-bit variants would need CX/ECX.
+  MachineInstrBuilder MovMI2 =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[3]))
+          .addReg(X86::RCX, RegState::Define)
+          .addReg(SubReg);
+  InsInstrs.push_back(MovMI2);
+
+  // VShrReg = SrcReg >> CL
+  unsigned VShrReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(VShrReg, 0));
+  MachineInstrBuilder ShrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[4]), VShrReg)
+          .addReg(SrcReg, RegState::Kill);
+  InsInstrs.push_back(ShrMI);
+
+  // ResultReg = VShrReg | VShlReg
+  MachineInstrBuilder OrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[5]))
+          .addReg(ResultReg, RegState::Define | getDeadRegState(isDead))
+          .addReg(VShrReg, RegState::Kill)
+          .addReg(VShlReg, RegState::Kill);
+  InsInstrs.push_back(OrMI);
+}
+
+static void genShldMri(MachineInstr &Root,
+                       SmallVectorImpl<MachineInstr *> &InsInstrs,
+                       SmallVectorImpl<MachineInstr *> &DelInstrs,
+                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                       ArrayRef<unsigned> Instrs, int8_t NBits) {
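+  // A sketch of the intended replacement for the memory form (the symbol x,
+  // the registers and the shift amounts are illustrative):
+  //   shldq $10, %rdi, x(%rip)
+  // becomes
+  //   shlq $10, x(%rip)
+  //   shrq $54, %rdi
+  //   orq %rdi, x(%rip)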
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  unsigned BaseReg = Root.getOperand(0).getReg();
+  // TODO: at the moment we support the PC-relative addressing mode only.
+  assert(BaseReg == X86::RIP || BaseReg == X86::EIP || BaseReg == X86::IP);
+  unsigned Scale = Root.getOperand(1).getImm();
+  unsigned IndexReg = Root.getOperand(2).getReg();
+  const MachineOperand &MO = Root.getOperand(3);
+
+  // TODO: we should support all possible kinds of displacement.
+  const GlobalValue *GlobalDisp = nullptr;
+  if (MO.isGlobal())
+    GlobalDisp = MO.getGlobal();
+  else
+    return; // TODO: add the other possible memory operands
+
+  unsigned SegReg = Root.getOperand(4).getReg();
+  unsigned SrcReg = Root.getOperand(5).getReg();
+  unsigned Shift = Root.getOperand(6).getImm();
+
+  // Shift the memory operand left in place.
+  MachineInstrBuilder ShlMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]))
+          .addReg(BaseReg)
+          .addImm(Scale)
+          .addReg(IndexReg)
+          .addGlobalAddress(GlobalDisp)
+          .addReg(SegReg)
+          .addImm(Shift);
+  InsInstrs.push_back(ShlMI);
+
+  // VShrReg = SrcReg >> (NBits - Shift)
+  unsigned VShrReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(VShrReg, 0));
+  MachineInstrBuilder ShrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), VShrReg)
+          .addReg(SrcReg)
+          .addImm(NBits - Shift);
+  InsInstrs.push_back(ShrMI);
+
+  // OR the bits shifted out of SrcReg into the memory operand.
+  MachineInstrBuilder OrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]))
+          .addReg(BaseReg)
+          .addImm(Scale)
+          .addReg(IndexReg)
+          .addGlobalAddress(GlobalDisp)
+          .addReg(SegReg)
+          .addReg(VShrReg);
+  InsInstrs.push_back(OrMI);
+}
+
+// We try to replace
+//   shldq $12, %rsi, %rdi # sched: [3:3.00]
+// with
+//   shlq $12, %rdi # sched: [1:0.50]
+//   shrq $52, %rsi # sched: [1:0.50]
+//   leaq (%rsi,%rdi), %rax # sched: [1:0.50]
+// iff the latter is faster.
+static void genShldRri(MachineInstr &Root,
+                       SmallVectorImpl<MachineInstr *> &InsInstrs,
+                       SmallVectorImpl<MachineInstr *> &DelInstrs,
+                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                       ArrayRef<unsigned> Instrs, int8_t NBits) {
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  unsigned ResultReg = Root.getOperand(0).getReg();
+  unsigned DestReg = Root.getOperand(1).getReg();
+  unsigned SrcReg = Root.getOperand(2).getReg();
+
+  // FIXME: can the immediate be zero here? If so, Root is deleted without a
+  // replacement being emitted.
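+  // Note on the OR-as-LEA trick below: the two shifted halves have no bits
+  // in common, so the OR can be computed as an ADD, and LEA both adds and,
+  // for shift amounts of 1, 2 or 4, folds the left shift into its scale
+  // factor. An illustrative sketch:
+  //   shldq $2, %rsi, %rdi
+  // becomes
+  //   shrq $62, %rsi
+  //   leaq (%rsi,%rdi,4), %rax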
+  if (int64_t Imm8 = Root.getOperand(3).getImm()) {
+    // A left shift by 1, 2 or 4 can be folded into the LEA scale factor
+    // (2, 4 or 8) instead of emitting a separate SHL.
+    unsigned VShlReg = DestReg;
+    unsigned Scale = 1;
+    switch (Imm8) {
+    case 1:
+      Scale = 2;
+      break;
+    case 2:
+      Scale = 4;
+      break;
+    case 4:
+      Scale = 8;
+      break;
+    default:
+      VShlReg = MRI.createVirtualRegister(RC);
+      InstrIdxForVirtReg.insert(std::make_pair(VShlReg, 0));
+      MachineInstrBuilder ShlMI =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), VShlReg)
+              .addReg(DestReg)
+              .addImm(Imm8);
+      InsInstrs.push_back(ShlMI);
+    }
+
+    // VShrReg = SrcReg >> (NBits - Imm8)
+    unsigned VShrReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(VShrReg, 0));
+    MachineInstrBuilder ShrMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), VShrReg)
+            .addReg(SrcReg)
+            .addImm(NBits - Imm8);
+    InsInstrs.push_back(ShrMI);
+
+    // ResultReg = VShrReg + VShlReg * Scale, i.e. the OR of the two
+    // disjoint halves.
+    MachineInstrBuilder LeaMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]), ResultReg)
+            .addReg(VShrReg)
+            .addImm(Scale)
+            .addReg(VShlReg)
+            .addImm(0) // Disp
+            .addReg(0); // SegReg
+    InsInstrs.push_back(LeaMI);
+  }
+}
+
+/// When getMachineCombinerPatterns() finds potential patterns, this function
+/// generates the instructions that could replace the original code sequence.
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+
+  switch (Pattern) {
+  default:
+    // Reassociate instructions.
+    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+                                                DelInstrs, InstrIdxForVirtReg);
+    return;
+  case MachineCombinerPattern::SHLD2SHIFTS:
+    switch (Root.getOpcode()) {
+    default:
+      return;
+    // TODO: do we need sequences for SHLD16mrCL, SHLD32mrCL, SHLD64mrCL?
+    case X86::SHLD16mri8:
+      genShldMri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL16mi, X86::SHR16ri, X86::OR16mr}, 16);
+      break;
+    case X86::SHLD16rrCL:
+      genShldCl(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                {X86::SHL16rCL, X86::MOV16ri, X86::SUB16rr, X86::MOV16rr,
+                 X86::SHR16rCL, X86::OR16rr});
+      break;
+    case X86::SHLD16rri8:
+      genShldRri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL16ri, X86::SHR16ri, X86::LEA16r}, 16);
+      break;
+    case X86::SHLD32mri8:
+      genShldMri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL32mi, X86::SHR32ri, X86::OR32mr}, 32);
+      break;
+    case X86::SHLD32rrCL:
+      genShldCl(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                {X86::SHL32rCL, X86::MOV32ri, X86::SUB32rr, X86::MOV32rr,
+                 X86::SHR32rCL, X86::OR32rr});
+      break;
+    case X86::SHLD32rri8:
+      genShldRri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL32ri, X86::SHR32ri, X86::LEA32r}, 32);
+      break;
+    case X86::SHLD64mri8:
+      genShldMri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL64mi, X86::SHR64ri, X86::OR64mr}, 64);
+      break;
+    case X86::SHLD64rrCL:
+      genShldCl(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                {X86::SHL64rCL, X86::MOV64ri, X86::SUB64rr, X86::MOV64rr,
+                 X86::SHR64rCL, X86::OR64rr});
+      break;
+    case X86::SHLD64rri8:
+      genShldRri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL64ri, X86::SHR64ri, X86::LEA64r}, 64);
+      break;
+    }
+    DelInstrs.push_back(&Root); // Record SHLD/SHRD for deletion.
+    break;
+  }
+}
+
+/// Find SHLD/SHRD instructions eligible for the SHLD2SHIFTS substitution.
+static bool getSHLDPatterns(MachineInstr &Root,
+                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  switch (Root.getOpcode()) {
+  default:
+    return false;
+  // TODO: do we need sequences for SHLD(XX)mrCL?
+  // The resulting sequences seem too long:
+  //   movq x(%rip), %rax # sched: [5:1.00]
+  //   movl %esi, %ecx # sched: [1:0.50]
+  //   shlq %cl, %rax # sched: [1:0.50]
+  //   movl $64, %ecx # sched: [1:0.50]
+  //   subl %esi, %ecx # sched: [1:0.50]
+  //   shrq %cl, %rdi # sched: [1:0.50]
+  //   orq %rax, %rdi # sched: [1:0.50]
+  //   movq %rdi, x(%rip) # sched: [1:1.00]
+  // TODO: at the moment we support 64-bit only.
+  // case X86::SHLD16mri8:
+  // case X86::SHLD16rrCL:
+  // case X86::SHLD16rri8:
+  // case X86::SHLD32mri8:
+  // case X86::SHLD32rrCL:
+  // case X86::SHLD32rri8:
+  case X86::SHLD64mri8:
+  case X86::SHLD64rrCL:
+  case X86::SHLD64rri8:
+    break;
+  }
+  Patterns.push_back(MachineCombinerPattern::SHLD2SHIFTS);
+  return true;
+}
+
+/// Return true when there is potentially a faster code sequence for an
+/// instruction chain ending in \p Root. All potential patterns are listed in
+/// the \p Patterns vector. Patterns should be sorted in priority order, since
+/// the pattern evaluator stops checking as soon as it finds a faster sequence.
+bool X86InstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // SHLD patterns
+  if (MCShldEnabled && getSHLDPatterns(Root, Patterns))
+    return true;
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
+
+bool X86InstrInfo::useMachineCombiner() const {
+  // TODO: consider returning Subtarget.getSchedModel().hasInstrSchedModel().
+  return true;
+}
Index: test/CodeGen/X86/schedule-x86-64-shld.ll
===================================================================
--- test/CodeGen/X86/schedule-x86-64-shld.ll
+++ test/CodeGen/X86/schedule-x86-64-shld.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mc-shld-enabled -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mc-shld-enabled -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mc-shld-enabled -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER1
 
 ; uint64_t lshift10(uint64_t a, uint64_t b)
@@ -43,9 +43,8 @@
 ;
 ; BTVER2-LABEL: lshift10:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    shlq $10, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    shrq $54, %rsi # sched: [1:0.50]
-; BTVER2-NEXT:    leaq (%rsi,%rdi), %rax # sched: [1:0.50]
+; BTVER2-NEXT:    shldq $10, %rsi, %rdi # sched: [3:3.00]
+; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: lshift10:
@@ -102,8 +101,8 @@
 ;
 ; BTVER2-LABEL: rshift10:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    shrq $62, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
+; BTVER2-NEXT:    shrdq $62, %rsi, %rdi # sched: [3:3.00]
+; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: rshift10:
@@ -163,13 +162,8 @@
 ; BTVER2-LABEL: lshift_cl:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movl %edx, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    shlq %cl, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    movl $64, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    subl %edx, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    # kill: def %cl killed %cl killed %ecx
-; BTVER2-NEXT:    shrq %cl, %rsi # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rdi, %rsi # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rsi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    shldq %cl, %rsi, %rdi # sched: [4:4.00]
+; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: lshift_cl:
@@ -237,13 +231,8 @@
 ; BTVER2-LABEL: rshift_cl:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movl %edx, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    movl $64, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    subl %edx, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    # kill: def %cl killed %cl killed %ecx
-; BTVER2-NEXT:    shlq %cl, %rsi # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rdi, %rsi # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rsi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    shrdq %cl, %rsi, %rdi # sched: [4:4.00]
+; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: rshift_cl:
@@ -309,15 +298,8 @@
 ;
 ; BTVER2-LABEL: lshift_mem_cl:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
 ; BTVER2-NEXT:    movl %esi, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    shlq %cl, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    movl $64, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    subl %esi, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    # kill: def %cl killed %cl killed %ecx
-; BTVER2-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rax, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; BTVER2-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [9:11.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: lshift_mem_cl:
@@ -350,11 +332,7 @@
 ;
 ; BTVER2-LABEL: lshift_mem:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
-; BTVER2-NEXT:    shrq $54, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    shlq $10, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rax, %rdi # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; BTVER2-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [9:11.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: lshift_mem:
@@ -402,7 +380,9 @@
 ; GENERIC-LABEL: lshift_mem_b:
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
-; GENERIC-NEXT:    shrdq $54, %rdi, %rax # sched: [2:0.67]
+; GENERIC-NEXT:    shlq $10, %rdi # sched: [1:0.50]
+; GENERIC-NEXT:    shrq $54, %rax # sched: [1:0.50]
+; GENERIC-NEXT:    leaq (%rax,%rdi), %rax # sched: [1:0.50]
 ; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -411,7 +391,7 @@
 ; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
 ; BTVER2-NEXT:    shlq $10, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    shrq $54, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    leaq (%rax,%rdi), %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;