Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -73,7 +73,8 @@
   FMLSv2f32_OP2,
   FMLSv2f64_OP2,
   FMLSv4i32_indexed_OP2,
-  FMLSv4f32_OP2
+  FMLSv4f32_OP2,
+  SHLD2SHIFTOR
 };

 } // end namespace llvm
Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -42,6 +42,7 @@

 namespace {
 class MachineCombiner : public MachineFunctionPass {
+  const TargetSubtargetInfo *STI;
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
   MCSchedModel SchedModel;
@@ -139,11 +140,11 @@
   // For each instruction in the new sequence compute the depth based on the
   // operands. Use the trace information when possible. For new operands which
   // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
+  DEBUG(dbgs() << " Could be replaced with\n");
   for (auto *InstrPtr : InsInstrs) { // for each Use
     unsigned IDepth = 0;
-    DEBUG(dbgs() << "NEW INSTR ";
-          InstrPtr->print(dbgs(), TII);
-          dbgs() << "\n";);
+    DEBUG(dbgs() << "\t" << STI->getSchedInfoStr(*InstrPtr) << ": ";
+          InstrPtr->print(dbgs(), false, false, TII););
     for (const MachineOperand &MO : InstrPtr->operands()) {
       // Check for virtual register operand.
       if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
@@ -163,8 +164,14 @@
         DepthOp = InstrDepth[II->second];
         int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
         int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
-        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
-                                                      InstrPtr, UseIdx);
+        if (DefIdx < 0 || UseIdx < 0)
+          // Without def/use indices we cannot compute the latency from the
+          // sched model, so we are forced to use the default def latency.
+          LatencyOp = TII->defaultDefLatency(SchedModel, *DefInstr);
+        else
+          LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
+                                                        InstrPtr, UseIdx);
+
       } else {
         MachineInstr *DefInstr = getOperandDef(MO);
         if (DefInstr) {
@@ -266,17 +273,20 @@
   unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

-  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
-        dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
-        dbgs() << " RootDepth: " << RootDepth << "\n");
+  DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
+               << NewRootDepth << "\tRootDepth: " << RootDepth);

   // For a transform such as reassociation, the cost equation is
   // conservatively calculated so that we must improve the depth (data
   // dependency cycles) in the critical path to proceed with the transform.
   // Being conservative also protects against inaccuracies in the underlying
   // machine trace metrics and CPU models.
-  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
+  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
+    DEBUG(dbgs() << "\tIt MustReduceDepth ");
+    DEBUG(NewRootDepth < RootDepth ? dbgs() << "\t and it does it\n"
+                                   : dbgs() << "\t but it does NOT do it\n");
     return NewRootDepth < RootDepth;
+  }

   // A more flexible cost calculation for the critical path includes the slack
   // of the original code sequence. This may allow the transform to proceed
@@ -290,17 +300,17 @@
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
   unsigned NewCycleCount = NewRootDepth + NewRootLatency;
-  unsigned OldCycleCount = RootDepth + RootLatency +
-                           (SlackIsAccurate ? RootSlack : 0);
-  DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
-        dbgs() << " RootLatency: " << RootLatency << "\n";
-        dbgs() << " RootSlack: " << RootSlack << " SlackIsAccurate="
-               << SlackIsAccurate << "\n";
-        dbgs() << " NewRootDepth + NewRootLatency = "
-               << NewCycleCount << "\n";
-        dbgs() << " RootDepth + RootLatency + RootSlack = "
-               << OldCycleCount << "\n";
-        );
+  unsigned OldCycleCount =
+      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
+  DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency << "\tRootLatency: "
+               << RootLatency << "\n\tRootSlack: " << RootSlack
+               << " SlackIsAccurate=" << SlackIsAccurate
+               << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
+               << "\n\tRootDepth + RootLatency + RootSlack = "
+               << OldCycleCount;);
+  DEBUG(NewCycleCount <= OldCycleCount
+            ? dbgs() << "\n\t  It IMPROVES PathLen because\n"
+            : dbgs() << "\n\t  It DOES NOT improve PathLen because\n");

   return NewCycleCount <= OldCycleCount;
 }
@@ -346,9 +356,11 @@
   unsigned ResLenAfterCombine =
       BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

-  DEBUG(dbgs() << "RESOURCE DATA: \n";
-        dbgs() << " resource len before: " << ResLenBeforeCombine
-               << " after: " << ResLenAfterCombine << "\n";);
+  DEBUG(dbgs() << "\t\tResource length before replacement: "
+               << ResLenBeforeCombine << " and after: " << ResLenAfterCombine
+               << "\n";);
+  DEBUG(ResLenAfterCombine <= ResLenBeforeCombine
+            ? dbgs() << "\t\t  As a result it IMPROVES/PRESERVES Resource Length\n"
+            : dbgs() << "\t\t  As a result it DOES NOT improve/preserve Resource Length\n");

   return ResLenAfterCombine <= ResLenBeforeCombine;
 }
@@ -427,9 +439,8 @@
   while (BlockIter != MBB->end()) {
     auto &MI = *BlockIter++;
-
-    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
     SmallVector<MachineCombinerPattern, 16> Patterns;
+
     // The motivating example is:
     //
     //     MUL  Other        MUL_op1 MUL_op2  Other
@@ -458,6 +469,10 @@
     if (!TII->getMachineCombinerPatterns(MI, Patterns))
       continue;

+    DEBUG(dbgs() << " Possible instr(s) to replace\n");
+    DEBUG(dbgs() << "\t" << STI->getSchedInfoStr(MI) << ": ";
+          MI.print(dbgs(), false, false, TII););
+
     for (auto P : Patterns) {
       SmallVector<MachineInstr *, 16> InsInstrs;
       SmallVector<MachineInstr *, 16> DelInstrs;
@@ -510,6 +525,17 @@
           LastUpdate = BlockIter;
         }

+        DEBUG(dbgs() << "\tDelInstrs\n");
+        for (auto *InstrPtr : DelInstrs) {
+          DEBUG(dbgs() << "\t\t" << STI->getSchedInfoStr(*InstrPtr) << ": ";
+                InstrPtr->print(dbgs(), false, false, TII););
+        }
+        DEBUG(dbgs() << "\tInsInstrs\n");
+        for (auto *InstrPtr : InsInstrs) {
+          DEBUG(dbgs() << "\t\t" << STI->getSchedInfoStr(*InstrPtr) << ": ";
+                InstrPtr->print(dbgs(), false, false, TII););
+        }
+
         insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                  RegUnits, IncrementalUpdate);
@@ -533,11 +559,11 @@
 }

 bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
-  const TargetSubtargetInfo &STI = MF.getSubtarget();
-  TII = STI.getInstrInfo();
-  TRI = STI.getRegisterInfo();
-  SchedModel = STI.getSchedModel();
-  TSchedModel.init(SchedModel, &STI, TII);
+  STI = &MF.getSubtarget();
+  TII = STI->getInstrInfo();
+  TRI = STI->getRegisterInfo();
+  SchedModel = STI->getSchedModel();
+  TSchedModel.init(SchedModel, STI, TII);
   MRI = &MF.getRegInfo();
   MLI = &getAnalysis<MachineLoopInfo>();
   Traces = &getAnalysis<MachineTraceMetrics>();
Index: lib/CodeGen/MachineInstr.cpp
===================================================================
--- lib/CodeGen/MachineInstr.cpp
+++ lib/CodeGen/MachineInstr.cpp
@@ -1899,8 +1899,11 @@
                          const TargetInstrInfo *TII) const {
   const Module *M = nullptr;
   if (const MachineBasicBlock *MBB = getParent())
-    if (const MachineFunction *MF = MBB->getParent())
+    if (const MachineFunction *MF = MBB->getParent()) {
       M = MF->getFunction()->getParent();
+      if (!TII)
+        TII = MF->getSubtarget().getInstrInfo();
+    }

   ModuleSlotTracker MST(M);
   print(OS, MST, SkipOpers, SkipDebugLoc, TII);
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -33305,8 +33305,8 @@
   // series of shifts/or that would otherwise be generated.
   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
   // have higher latencies and we are not optimizing for size.
-  if (!OptForSize && Subtarget.isSHLDSlow())
-    return SDValue();
+  // if (!OptForSize && Subtarget.isSHLDSlow())
+  //   return SDValue();

   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
     std::swap(N0, N1);
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -517,6 +517,21 @@
   bool useMachineCombiner() const override { return true; }

+  /// When getMachineCombinerPatterns() finds patterns, this function generates
+  /// the instructions that could replace the original code sequence.
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in <Root>. All potential patterns are
+  /// listed in the <Patterns> array.
+  bool getMachineCombinerPatterns(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern> &Patterns) const override;
+
   bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;

   bool hasReassociableOperands(const MachineInstr &Inst,
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -10946,3 +10946,182 @@

   return It;
 }
+
+// We try to replace
+//   shldq $12, %rsi, %rdi           # sched: [3:3.00]
+// with
+//   shlq $12, %rdi                  # sched: [1:0.50]
+//   shrq $52, %rsi                  # sched: [1:0.50]
+//   leaq (%rsi,%rdi), %rax          # sched: [1:0.50]
+// iff the latter is faster.
+// TODO: SHLD could require an additional move like
+//   movq %rdi, %rax                 # sched: [1:0.17]
+// but that's another story.
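+//
+// When the immediate is 1, 2 or 4, the left shift can instead be folded into
+// the LEA scale factor, e.g.
+//   shldq $1, %rsi, %rdi            # sched: [3:3.00]
+// becomes
+//   shrq $63, %rsi                  # sched: [1:0.50]
+//   leaq (%rsi,%rdi,2), %rax        # sched: [1:0.50]
+// (see the Scale handling in genShiftsLea below).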
+
+static void genShiftsLea(MachineInstr &Root,
+                         SmallVectorImpl<MachineInstr *> &InsInstrs,
+                         SmallVectorImpl<MachineInstr *> &DelInstrs,
+                         DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                         ArrayRef<unsigned> Instrs) {
+
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  // Instrs holds the SHL, SHR and LEA opcodes for the operand width.
+  int8_t Bits;
+  switch (Instrs[0]) {
+  default:
+    return; // Invalid instruction
+  case X86::SHL16ri:
+    Bits = 16;
+    break;
+  case X86::SHL32ri:
+    Bits = 32;
+    break;
+  case X86::SHL64ri:
+    Bits = 64;
+    break;
+  }
+
+  unsigned ResultReg = Root.getOperand(0).getReg();
+  unsigned DestReg = Root.getOperand(1).getReg();
+  unsigned SrcReg = Root.getOperand(2).getReg();
+  int64_t Imm8 = Root.getOperand(3).getImm();
+
+  // A left shift by 1, 2 or 4 can be folded into the LEA as a scale factor
+  // instead of emitting a separate SHL.
+  unsigned ShlVReg = DestReg;
+  unsigned Scale = 1;
+  switch (Imm8) {
+  case 1:
+    Scale = 2;
+    break;
+  case 2:
+    Scale = 4;
+    break;
+  case 4:
+    Scale = 8;
+    break;
+  default:
+    ShlVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(ShlVReg, 0));
+    MachineInstrBuilder ShlMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), ShlVReg)
+            .addReg(DestReg)
+            .addImm(Imm8);
+    InsInstrs.push_back(ShlMI);
+  }
+
+  unsigned ShrVReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(ShrVReg, 0));
+  MachineInstrBuilder ShrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), ShrVReg)
+          .addReg(SrcReg)
+          .addImm(Bits - Imm8);
+  InsInstrs.push_back(ShrMI);
+
+  // LEA address operands: Base, Scale, Index, Disp, Segment.
+  MachineInstrBuilder LeaMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]), ResultReg)
+          .addReg(ShrVReg)
+          .addImm(Scale)
+          .addReg(ShlVReg)
+          .addImm(0)  // Disp
+          .addReg(0); // SegReg
+  InsInstrs.push_back(LeaMI);
+
+  DelInstrs.push_back(&Root); // Record SHLD/SHRD for deletion
+}
+
+/// When getMachineCombinerPatterns() finds potential patterns,
+/// this function generates the instructions that could replace the
+/// original code sequence.
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+  // MachineBasicBlock &MBB = *Root.getParent();
+  // MachineFunction &MF = *MBB.getParent();
+
+  switch (Pattern) {
+  default:
+    // Reassociate instructions.
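+    // All other patterns (the generic REASSOC_* ones) are delegated to the
+    // TargetInstrInfo implementation.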
+    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+                                                DelInstrs, InstrIdxForVirtReg);
+    return;
+  case MachineCombinerPattern::SHLD2SHIFTOR:
+    // Only register-register immediate forms are converted; the memory and
+    // CL-count forms bail out below.
+    switch (Root.getOpcode()) {
+    default:
+      return;
+    case X86::SHLD16mrCL:
+    case X86::SHLD32mrCL:
+    case X86::SHLD64mrCL:
+    case X86::SHLD16mri8:
+    case X86::SHLD32mri8:
+    case X86::SHLD64mri8:
+      return;
+    case X86::SHLD16rrCL:
+    case X86::SHLD32rrCL:
+    case X86::SHLD64rrCL:
+      return;
+    case X86::SHLD16rri8:
+      // The 16-bit (and 32-bit below) conversion is disabled for now, so we
+      // return before reaching genShiftsLea.
+      return;
+      genShiftsLea(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                   {X86::SHL16ri, X86::SHR16ri, X86::LEA16r});
+      break;
+    case X86::SHLD32rri8:
+      return;
+      genShiftsLea(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                   {X86::SHL32ri, X86::SHR32ri, X86::LEA32r});
+      break;
+    case X86::SHLD64rri8:
+      genShiftsLea(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                   {X86::SHL64ri, X86::SHR64ri, X86::LEA64r});
+      break;
+    }
+    break;
+  }
+}
+
+/// Find SHLD/SHRD instructions.
+static bool getSHLDPatterns(MachineInstr &Root,
+                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  switch (Root.getOpcode()) {
+  default:
+    return false;
+  case X86::SHLD16mrCL:
+  case X86::SHLD32mrCL:
+  case X86::SHLD64mrCL:
+  case X86::SHLD16mri8:
+  case X86::SHLD32mri8:
+  case X86::SHLD64mri8:
+  case X86::SHLD16rrCL:
+  case X86::SHLD32rrCL:
+  case X86::SHLD64rrCL:
+  case X86::SHLD16rri8:
+  case X86::SHLD32rri8:
+    break;
+  case X86::SHLD64rri8:
+    break;
+  }
+  Patterns.push_back(MachineCombinerPattern::SHLD2SHIFTOR);
+  return true;
+}
+
+/// Return true when there is potentially a faster code sequence for an
+/// instruction chain ending in \p Root. All potential patterns are listed in
+/// the \p Patterns vector. Patterns should be sorted in priority order since
+/// the pattern evaluator stops checking as soon as it finds a faster sequence.
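+///
+/// For X86 the only target-specific pattern currently added is SHLD2SHIFTOR;
+/// everything else falls back to the TargetInstrInfo implementation.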
+bool X86InstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // SHLD/SHRD patterns.
+  if (getSHLDPatterns(Root, Patterns))
+    return true;
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
Index: test/CodeGen/X86/2008-07-11-SHLBy1.ll
===================================================================
--- test/CodeGen/X86/2008-07-11-SHLBy1.ll
+++ test/CodeGen/X86/2008-07-11-SHLBy1.ll
@@ -1,5 +1,28 @@
-; RUN: llc < %s -mtriple=x86_64-- -o - | not grep shr
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -o - -mcpu=skylake -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=SKL
+; RUN: llc < %s -mtriple=x86_64-- -o - -mcpu=btver2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-- -o - -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 define i128 @sl(i128 %x) {
+; SKL-LABEL: sl:
+; SKL:       # BB#0:
+; SKL-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
+; SKL-NEXT:    shrq $63, %rdi # sched: [1:0.50]
+; SKL-NEXT:    leaq (%rdi,%rsi,2), %rdx # sched: [1:0.50]
+; SKL-NEXT:    retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: sl:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
+; BTVER2-NEXT:    shrq $63, %rdi # sched: [1:0.50]
+; BTVER2-NEXT:    leaq (%rdi,%rsi,2), %rdx # sched: [1:0.50]
+; BTVER2-NEXT:    retq # sched: [4:1.00]
+;
+; HASWELL-LABEL: sl:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
+; HASWELL-NEXT:    shrq $63, %rdi # sched: [1:0.50]
+; HASWELL-NEXT:    leaq (%rdi,%rsi,2), %rdx # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [2:1.00]
   %t = shl i128 %x, 1
   ret i128 %t
 }
Index: test/CodeGen/X86/known-bits.ll
===================================================================
--- test/CodeGen/X86/known-bits.ll
+++ test/CodeGen/X86/known-bits.ll
@@ -152,7 +152,9 @@
 ; X64-NEXT:    andq $-1024, %rsi # imm = 0xFC00
 ; X64-NEXT:    addq %rdi, %rsi
 ; X64-NEXT:    adcl $0, %edx
-; X64-NEXT:    shldq $54, %rsi, %rdx
+; X64-NEXT:    shlq $54, %rdx
+; X64-NEXT:    shrq $10, %rsi
+; X64-NEXT:    leaq (%rsi,%rdx), %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    retq
   %1 = and i64 %a0, -1024
Index: test/CodeGen/X86/rot64.ll
===================================================================
--- test/CodeGen/X86/rot64.ll
+++ test/CodeGen/X86/rot64.ll
@@ -113,11 +113,24 @@
 }

 define i64 @xbar(i64 %x, i64 %y, i64 %z) nounwind readnone {
-; ALL-LABEL: xbar:
-; ALL:       # BB#0: # %entry
-; ALL-NEXT:    shrdq $57, %rsi, %rdi
-; ALL-NEXT:    movq %rdi, %rax
-; ALL-NEXT:    retq
+; X64-LABEL: xbar:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    shrdq $57, %rsi, %rdi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    retq
+;
+; SHLD-LABEL: xbar:
+; SHLD:       # BB#0: # %entry
+; SHLD-NEXT:    shrdq $57, %rsi, %rdi
+; SHLD-NEXT:    movq %rdi, %rax
+; SHLD-NEXT:    retq
+;
+; BMI2-LABEL: xbar:
+; BMI2:       # BB#0: # %entry
+; BMI2-NEXT:    shlq $7, %rsi
+; BMI2-NEXT:    shrq $57, %rdi
+; BMI2-NEXT:    leaq (%rdi,%rsi), %rax
+; BMI2-NEXT:    retq
 entry:
   %0 = shl i64 %y, 7
   %1 = lshr i64 %x, 57
@@ -175,11 +188,24 @@
 }

 define i64 @xbu(i64 %x, i64 %y, i64 %z) nounwind readnone {
-; ALL-LABEL: xbu:
-; ALL:       # BB#0: # %entry
-; ALL-NEXT:    shldq $57, %rsi, %rdi
-; ALL-NEXT:    movq %rdi, %rax
-; ALL-NEXT:    retq
+; X64-LABEL: xbu:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    shldq $57, %rsi, %rdi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    retq
+;
+; SHLD-LABEL: xbu:
+; SHLD:       # BB#0: # %entry
+; SHLD-NEXT:    shldq $57, %rsi, %rdi
+; SHLD-NEXT:    movq %rdi, %rax
+; SHLD-NEXT:    retq
+;
+; BMI2-LABEL: xbu:
+; BMI2:       # BB#0: # %entry
+; BMI2-NEXT:    shlq $57, %rdi
+; BMI2-NEXT:    shrq $7, %rsi
+; BMI2-NEXT:    leaq (%rsi,%rdi), %rax
+; BMI2-NEXT:    retq
 entry:
   %0 = lshr i64 %y, 7
   %1 = shl i64 %x, 57
Index: test/CodeGen/X86/x86-64-double-precision-shift-right.ll
===================================================================
--- test/CodeGen/X86/x86-64-double-precision-shift-right.ll
+++ test/CodeGen/X86/x86-64-double-precision-shift-right.ll
@@ -1,6 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 | FileCheck %s
 ; Verify that for the architectures that are known to have poor latency
-; double precision shift instructions we generate alternative sequence
+; double precision shift instructions we generate alternative sequence
 ; of instructions with lower latencies instead of shrd instruction.

 ;uint64_t rshift1(uint64_t a, uint64_t b)
@@ -14,6 +15,12 @@
 ; CHECK-NEXT:        leaq    ({{.*}},{{.*}}), {{.*}}

 define i64 @rshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+; CHECK-LABEL: rshift1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shlq $63, %rsi
+; CHECK-NEXT:    shrq $1, %rdi
+; CHECK-NEXT:    leaq (%rdi,%rsi), %rax
+; CHECK-NEXT:    retq
   %1 = lshr i64 %a, 1
   %2 = shl i64 %b, 63
   %3 = or i64 %2, %1
@@ -32,6 +39,12 @@

 define i64 @rshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+; CHECK-LABEL: rshift2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shlq $62, %rsi
+; CHECK-NEXT:    shrq $2, %rdi
+; CHECK-NEXT:    leaq (%rdi,%rsi), %rax
+; CHECK-NEXT:    retq
   %1 = lshr i64 %a, 2
   %2 = shl i64 %b, 62
   %3 = or i64 %2, %1
@@ -50,6 +63,12 @@

 define i64 @rshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+; CHECK-LABEL: rshift7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shlq $57, %rsi
+; CHECK-NEXT:    shrq $7, %rdi
+; CHECK-NEXT:    leaq (%rdi,%rsi), %rax
+; CHECK-NEXT:    retq
   %1 = lshr i64 %a, 7
   %2 = shl i64 %b, 57
   %3 = or i64 %2, %1
@@ -66,6 +85,11 @@
 ; CHECK-NEXT:        leaq    (%rdi,%rsi,2), %rax

 define i64 @rshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+; CHECK-LABEL: rshift63:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shrq $63, %rdi
+; CHECK-NEXT:    leaq (%rdi,%rsi,2), %rax
+; CHECK-NEXT:    retq
   %1 = lshr i64 %a, 63
   %2 = shl i64 %b, 1
   %3 = or i64 %2, %1
Index: test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
===================================================================
--- test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
+++ test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -1,8 +1,9 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s

 ; clang -Oz -c test1.cpp -emit-llvm -S -o
 ; Verify that we generate shld instruction when we are optimizing for size,
-; even for X86_64 processors that are known to have poor latency double
+; even for X86_64 processors that are known to have poor latency double
 ; precision shift instructions.
 ; uint64_t lshift10(uint64_t a, uint64_t b)
 ; {
@@ -11,8 +12,13 @@

 ; Function Attrs: minsize nounwind readnone uwtable
 define i64 @_Z8lshift10mm(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: _Z8lshift10mm:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    shlq $10, %rdi
+; CHECK-NEXT:    shrq $54, %rsi
+; CHECK-NEXT:    leaq (%rsi,%rdi), %rax
+; CHECK-NEXT:    retq
 entry:
-; CHECK: shldq $10
   %shl = shl i64 %a, 10
   %shr = lshr i64 %b, 54
   %or = or i64 %shr, %shl
@@ -33,8 +39,13 @@

 ; Function Attrs: nounwind optsize readnone uwtable
 define i64 @_Z8lshift11mm(i64 %a, i64 %b) #1 {
+; CHECK-LABEL: _Z8lshift11mm:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    shlq $11, %rdi
+; CHECK-NEXT:    shrq $53, %rsi
+; CHECK-NEXT:    leaq (%rsi,%rdi), %rax
+; CHECK-NEXT:    retq
 entry:
-; CHECK: shldq $11
   %shl = shl i64 %a, 11
   %shr = lshr i64 %b, 53
   %or = or i64 %shr, %shl
@@ -54,9 +65,13 @@

 ; Function Attrs: nounwind optsize readnone uwtable
 define i64 @_Z8lshift12mm(i64 %a, i64 %b) #2 {
+; CHECK-LABEL: _Z8lshift12mm:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    shlq $12, %rdi
+; CHECK-NEXT:    shrq $52, %rsi
+; CHECK-NEXT:    leaq (%rsi,%rdi), %rax
+; CHECK-NEXT:    retq
 entry:
-; CHECK: shlq $12
-; CHECK-NEXT: shrq $52
   %shl = shl i64 %a, 12
   %shr = lshr i64 %b, 52
   %or = or i64 %shr, %shl
Index: test/CodeGen/X86/x86-64-double-shifts-var.ll
===================================================================
--- test/CodeGen/X86/x86-64-double-shifts-var.ll
+++ test/CodeGen/X86/x86-64-double-shifts-var.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon-tbird | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon-4 | FileCheck %s
@@ -19,7 +20,7 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s

-; Verify that for the X86_64 processors that are known to have poor latency
+; Verify that for the X86_64 processors that are known to have poor latency
 ; double precision shift instructions we do not generate 'shld' or 'shrd'
 ; instructions.
@@ -29,8 +30,13 @@
 ;}

 define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+; CHECK-LABEL: lshift:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    shldq %cl, %rsi, %rdi
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    retq
 entry:
-; CHECK-NOT: shld
   %sh_prom = zext i32 %c to i64
   %shl = shl i64 %a, %sh_prom
   %sub = sub nsw i32 64, %c
@@ -46,8 +52,13 @@
 ;}

 define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+; CHECK-LABEL: rshift:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    shrdq %cl, %rsi, %rdi
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    retq
 entry:
-; CHECK-NOT: shrd
   %sh_prom = zext i32 %c to i64
   %shr = lshr i64 %a, %sh_prom
   %sub = sub nsw i32 64, %c