Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -79,7 +79,8 @@
   FMLSv4f32_OP1,
   FMLSv4f32_OP2,
   FMLSv4i32_indexed_OP1,
-  FMLSv4i32_indexed_OP2
+  FMLSv4i32_indexed_OP2,
+  SHLD2SHIFTS
 };

 } // end namespace llvm
Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -162,8 +162,14 @@
       DepthOp = InstrDepth[II->second];
       int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
       int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
-      LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
-                                                    InstrPtr, UseIdx);
+      if (DefIdx < 0 || UseIdx < 0)
+        // Without def/use operand indices we cannot compute the latency from
+        // the sched model, so we are forced to fall back to the default value.
+        LatencyOp = TII->defaultDefLatency(SchedModel, *DefInstr);
+      else
+        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
+                                                      InstrPtr, UseIdx);
+
     } else {
       MachineInstr *DefInstr = getOperandDef(MO);
       if (DefInstr) {
@@ -364,6 +370,8 @@
 bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
   if (OptSize && (NewSize < OldSize))
     return true;
+  if (OptSize && (NewSize > OldSize))
+    return false;
   if (!TSchedModel.hasInstrSchedModelOrItineraries())
     return true;
   return false;
@@ -498,7 +506,7 @@
         // Eagerly stop after the first pattern fires.
         Changed = true;
         break;
-      } else {
+      } else if (!(OptSize && (NewInstCount > OldInstCount))) {
         // For big basic blocks, we only compute the full trace the first time
         // we hit this. We do not invalidate the trace, but instead update the
         // instruction depths incrementally.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -33725,7 +33725,8 @@
   // series of shifts/or that would otherwise be generated.
   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
   // have higher latencies and we are not optimizing for size.
-  if (!OptForSize && Subtarget.isSHLDSlow())
+  // TODO: for 64-bit targets we keep generating SHLD/SHRD here and let the
+  // MachineCombiner expand them when that is faster.
+  if (!OptForSize && Subtarget.isSHLDSlow() && !Subtarget.is64Bit())
     return SDValue();

   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
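For context, this is the kind of source that produces the pattern above (my own sketch, not taken from the patch; the constant-count case mirrors the lshift10/lshift12 functions quoted in the x86-64-double-shifts-Oz-Os-O2.ll test further down). With a constant count on a 64-bit target the DAG combine still folds it into SHLD; whether it is expanded back into plain shifts is now decided later by the MachineCombiner patterns added in X86InstrInfo.cpp below.

#include <cstdint>

// (a << 12) | (b >> 52) is the double-precision left shift that used to be
// kept as plain shifts on "slow SHLD" CPUs; with this patch it is folded to
// SHLD during ISel and possibly expanded again by the MachineCombiner.
uint64_t lshift12(uint64_t a, uint64_t b) {
  return (a << 12) | (b >> 52);
}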
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -517,6 +517,21 @@
   bool useMachineCombiner() const override { return true; }

+  /// When getMachineCombinerPatterns() finds patterns, this function generates
+  /// the instructions that could replace the original code sequence.
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in \p Root. All potential patterns are
+  /// listed in the \p Patterns array.
+  bool getMachineCombinerPatterns(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern> &Patterns) const override;
+
   bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;

   bool hasReassociableOperands(const MachineInstr &Inst,
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -11030,3 +11030,310 @@
   return It;
 }
+
+static void genShldCl(MachineInstr &Root,
+                      SmallVectorImpl<MachineInstr *> &InsInstrs,
+                      SmallVectorImpl<MachineInstr *> &DelInstrs,
+                      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                      ArrayRef<unsigned> Instrs) {
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  unsigned ResultReg = Root.getOperand(0).getReg();
+  unsigned DestReg = Root.getOperand(1).getReg();
+  unsigned SrcReg = Root.getOperand(2).getReg();
+  unsigned CLReg = Root.getOperand(4).getReg();
+  assert(CLReg == X86::CL && "Shift count must be in CL!");
+
+  bool isDead = Root.getOperand(0).isDead();
+  bool isKill = Root.getOperand(1).isKill();
+
+  unsigned VShlReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(VShlReg, 0));
+  MachineInstrBuilder ShlMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), VShlReg)
+          .addReg(DestReg, getKillRegState(isKill));
+  InsInstrs.push_back(ShlMI);
+
+  unsigned MovReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(MovReg, 0));
+  MachineInstrBuilder MovMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]))
+          .addReg(MovReg, RegState::Define)
+          .addImm(64);
+  InsInstrs.push_back(MovMI);
+
+  unsigned SubReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(SubReg, 0));
+  MachineInstrBuilder SubMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]))
+          .addReg(SubReg, RegState::Define)
+          .addReg(MovReg, RegState::Kill)
+          .addReg(CLReg, RegState::Kill);
+  InsInstrs.push_back(SubMI);
+
+  MachineInstrBuilder MovMI2 =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[3]))
+          .addReg(X86::RCX, RegState::Define)
+          .addReg(SubReg);
+  InsInstrs.push_back(MovMI2);
+
+  unsigned VShrReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(VShrReg, 0));
+  MachineInstrBuilder ShrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[4]), VShrReg)
+          .addReg(SrcReg, RegState::Kill);
+  InsInstrs.push_back(ShrMI);
+
+  MachineInstrBuilder OrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[5]))
+          .addReg(ResultReg, RegState::Define | getDeadRegState(isDead))
+          .addReg(VShrReg, RegState::Kill)
+          .addReg(VShlReg, RegState::Kill);
+  InsInstrs.push_back(OrMI);
+}
+
+static void genShldMri(MachineInstr &Root,
+                       SmallVectorImpl<MachineInstr *> &InsInstrs,
+                       SmallVectorImpl<MachineInstr *> &DelInstrs,
+                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                       ArrayRef<unsigned> Instrs, int8_t NBits) {
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  unsigned BaseReg = Root.getOperand(0).getReg();
+  // TODO: at the moment we only support the PC-relative addressing mode.
+  assert((BaseReg == X86::RIP || BaseReg == X86::EIP || BaseReg == X86::IP) &&
+         "Expected a PC-relative base register!");
+  unsigned Scale = Root.getOperand(1).getImm();
+  unsigned IndexReg = Root.getOperand(2).getReg();
+  MachineOperand MO = Root.getOperand(3);
+
+  // TODO: we should support all possible kinds of displacement.
+  const GlobalValue *GlobalDisp = nullptr;
+  if (MO.isGlobal())
+    GlobalDisp = MO.getGlobal();
+  else
+    return; // TODO: add the other possible memory operands.
+
+  unsigned SegReg = Root.getOperand(4).getReg();
+  unsigned SrcReg = Root.getOperand(5).getReg();
+  unsigned Shift = Root.getOperand(6).getImm();
+
+  MachineInstrBuilder ShlMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]))
+          .addReg(BaseReg)
+          .addImm(Scale)
+          .addReg(IndexReg)
+          .addGlobalAddress(GlobalDisp)
+          .addReg(SegReg)
+          .addImm(Shift);
+  InsInstrs.push_back(ShlMI);
+
+  unsigned VShrReg = MRI.createVirtualRegister(RC);
+  InstrIdxForVirtReg.insert(std::make_pair(VShrReg, 0));
+  MachineInstrBuilder ShrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), VShrReg)
+          .addReg(SrcReg)
+          .addImm(NBits - Shift);
+  InsInstrs.push_back(ShrMI);
+
+  MachineInstrBuilder OrMI =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]))
+          .addReg(BaseReg)
+          .addImm(Scale)
+          .addReg(IndexReg)
+          .addGlobalAddress(GlobalDisp)
+          .addReg(SegReg)
+          .addReg(VShrReg);
+  InsInstrs.push_back(OrMI);
+}
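A note on the small-count cases handled by genShldRri below: when the immediate is 1, 2 or 4, the left shift is folded into the LEA scale factor, so no separate SHL is needed. A hypothetical source-level illustration of my own (the resulting code matches the updated SKL/HASWELL checks in 2008-07-11-SHLBy1.ll later in this patch):

#include <cstdint>

// High word of a 128-bit shift-by-one: (hi << 1) | (lo >> 63).
// Expected expansion:  shrq $63, %lo  ;  leaq (%lo,%hi,2), %result
uint64_t hi_of_shl1(uint64_t lo, uint64_t hi) {
  return (hi << 1) | (lo >> 63);
}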
+// We try to replace
+//   shldq $12, %rsi, %rdi                  # sched: [3:3.00]
+// with
+//   shlq $12, %rdi                         # sched: [1:0.50]
+//   shrq $52, %rsi                         # sched: [1:0.50]
+//   leaq (%rsi,%rdi), %rax                 # sched: [1:0.50]
+// if the latter sequence is faster.
+static void genShldRri(MachineInstr &Root,
+                       SmallVectorImpl<MachineInstr *> &InsInstrs,
+                       SmallVectorImpl<MachineInstr *> &DelInstrs,
+                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                       ArrayRef<unsigned> Instrs, int8_t NBits) {
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  unsigned ResultReg = Root.getOperand(0).getReg();
+  unsigned DestReg = Root.getOperand(1).getReg();
+  unsigned SrcReg = Root.getOperand(2).getReg();
+
+  // FIXME: Can the immediate be zero here?
+  if (int64_t Imm8 = Root.getOperand(3).getImm()) {
+    // The left shift can be folded into the LEA scale factor when the shift
+    // value is 1, 2 or 4.
+    unsigned VShlReg = DestReg;
+    unsigned Scale = 1;
+    switch (Imm8) {
+    case 1:
+      Scale = 2;
+      break;
+    case 2:
+      Scale = 4;
+      break;
+    case 4:
+      Scale = 8;
+      break;
+    default:
+      VShlReg = MRI.createVirtualRegister(RC);
+      InstrIdxForVirtReg.insert(std::make_pair(VShlReg, 0));
+      MachineInstrBuilder ShlMI =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), VShlReg)
+              .addReg(DestReg)
+              .addImm(Imm8);
+      InsInstrs.push_back(ShlMI);
+    }
+
+    unsigned VShrReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(VShrReg, 0));
+    MachineInstrBuilder ShrMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), VShrReg)
+            .addReg(SrcReg)
+            .addImm(NBits - Imm8);
+    InsInstrs.push_back(ShrMI);
+
+    MachineInstrBuilder LeaMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]), ResultReg)
+            .addReg(VShrReg)
+            .addImm(Scale)
+            .addReg(VShlReg)
+            .addImm(0)  // Disp
+            .addReg(0); // SegReg
+    InsInstrs.push_back(LeaMI);
+  }
+}
+
+/// When getMachineCombinerPatterns() finds potential patterns, this function
+/// generates the instructions that could replace the original code sequence.
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+
+  switch (Pattern) {
+  default:
+    // Reassociate instructions.
+    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+                                                DelInstrs, InstrIdxForVirtReg);
+    return;
+  case MachineCombinerPattern::SHLD2SHIFTS:
+    switch (Root.getOpcode()) {
+    default:
+      return;
+    // TODO: do we need sequences for SHLD16mrCL, SHLD32mrCL, SHLD64mrCL?
+    case X86::SHLD16mri8:
+      genShldMri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL16mi, X86::SHR16ri, X86::OR16mr}, 16);
+      break;
+    case X86::SHLD16rrCL:
+      genShldCl(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                {X86::SHL16rCL, X86::MOV16ri, X86::SUB16rr, X86::MOV16ri,
+                 X86::SHR16rCL, X86::OR16rr});
+      break;
+    case X86::SHLD16rri8:
+      genShldRri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL16ri, X86::SHR16ri, X86::LEA16r}, 16);
+      break;
+    case X86::SHLD32mri8:
+      genShldMri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL32mi, X86::SHR32ri, X86::OR32mr}, 32);
+      break;
+    case X86::SHLD32rrCL:
+      genShldCl(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                {X86::SHL32rCL, X86::MOV32ri, X86::SUB32rr, X86::MOV32ri,
+                 X86::SHR32rCL, X86::OR32rr});
+      break;
+    case X86::SHLD32rri8:
+      genShldRri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL32ri, X86::SHR32ri, X86::LEA32r}, 32);
+      break;
+    case X86::SHLD64mri8:
+      genShldMri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL64mi, X86::SHR64ri, X86::OR64mr}, 64);
+      break;
+    case X86::SHLD64rrCL:
+      genShldCl(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                {X86::SHL64rCL, X86::MOV64ri, X86::SUB64rr, X86::MOV64ri,
+                 X86::SHR64rCL, X86::OR64rr});
+      break;
+    case X86::SHLD64rri8:
+      genShldRri(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
+                 {X86::SHL64ri, X86::SHR64ri, X86::LEA64r}, 64);
+      break;
+    }
+    DelInstrs.push_back(&Root); // Record the SHLD/SHRD for deletion.
+    break;
+  }
+}
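For reference, the scalar identity these expansions rely on, written as a small self-contained check (my own sketch, not part of the patch; a count of zero is deliberately excluded, which is what the FIXME in genShldRri above is about):

#include <cassert>
#include <cstdint>

// shld(dst, src, c) == (dst << c) | (src >> (64 - c)) for 0 < c < 64.
static uint64_t shld64(uint64_t Dst, uint64_t Src, unsigned Count) {
  return (Dst << Count) | (Src >> (64 - Count));
}

int main() {
  assert(shld64(0x1, 0x8000000000000000ULL, 1) == 0x3);
  assert(shld64(0xF0F0F0F0F0F0F0F0ULL, 0x0F0F0F0F0F0F0F0FULL, 4) ==
         0x0F0F0F0F0F0F0F00ULL);
  return 0;
}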
+
+/// Find SHLD/SHRD instructions.
+static bool getSHLDPatterns(MachineInstr &Root,
+                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  switch (Root.getOpcode()) {
+  default:
+    return false;
+  // TODO: do we need sequences for SHLD(XX)mrCL?
+  // It seems they are too long:
+  //   movq x(%rip), %rax                   # sched: [5:1.00]
+  //   movl %esi, %ecx                      # sched: [1:0.50]
+  //   shlq %cl, %rax                       # sched: [1:0.50]
+  //   movl $64, %ecx                       # sched: [1:0.50]
+  //   subl %esi, %ecx                      # sched: [1:0.50]
+  //   shrq %cl, %rdi                       # sched: [1:0.50]
+  //   orq %rax, %rdi                       # sched: [1:0.50]
+  //   movq %rdi, x(%rip)                   # sched: [1:1.00]
+  // TODO: at the moment we support 64-bit only.
+  // case X86::SHLD16mri8:
+  // case X86::SHLD16rrCL:
+  // case X86::SHLD16rri8:
+  // case X86::SHLD32mri8:
+  // case X86::SHLD32rrCL:
+  // case X86::SHLD32rri8:
+  case X86::SHLD64mri8:
+  case X86::SHLD64rrCL:
+  case X86::SHLD64rri8:
+    break;
+  }
+  Patterns.push_back(MachineCombinerPattern::SHLD2SHIFTS);
+  return true;
+}
+
+/// Return true when there is potentially a faster code sequence for an
+/// instruction chain ending in \p Root. All potential patterns are listed in
+/// the \p Patterns vector. Patterns should be sorted in priority order, since
+/// the pattern evaluator stops checking as soon as it finds a faster sequence.
+bool X86InstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // SHLD/SHRD patterns.
+  if (getSHLDPatterns(Root, Patterns))
+    return true;
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
Index: test/CodeGen/X86/2008-07-11-SHLBy1.ll
===================================================================
--- test/CodeGen/X86/2008-07-11-SHLBy1.ll
+++ test/CodeGen/X86/2008-07-11-SHLBy1.ll
@@ -1,5 +1,28 @@
-; RUN: llc < %s -mtriple=x86_64-- -o - | not grep shr
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -o - -mcpu=skylake -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=SKL
+; RUN: llc < %s -mtriple=x86_64-- -o - -mcpu=btver2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-- -o - -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 define i128 @sl(i128 %x) {
+; SKL-LABEL: sl:
+; SKL:       # %bb.0:
+; SKL-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
+; SKL-NEXT:    shrq $63, %rdi # sched: [1:0.50]
+; SKL-NEXT:    leaq (%rdi,%rsi,2), %rdx # sched: [1:0.50]
+; SKL-NEXT:    retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: sl:
+; BTVER2:       # %bb.0:
+; BTVER2-NEXT:    shldq $1, %rdi, %rsi # sched: [3:3.00]
+; BTVER2-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
+; BTVER2-NEXT:    movq %rsi, %rdx # sched: [1:0.50]
+; BTVER2-NEXT:    retq # sched: [4:1.00]
+;
+; HASWELL-LABEL: sl:
+; HASWELL:       # %bb.0:
+; HASWELL-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
+; HASWELL-NEXT:    shrq $63, %rdi # sched: [1:0.50]
+; HASWELL-NEXT:    leaq (%rdi,%rsi,2), %rdx # sched: [1:0.50]
+; HASWELL-NEXT:    retq # sched: [7:1.00]
   %t = shl i128 %x, 1
   ret i128 %t
 }
Index: test/CodeGen/X86/known-bits.ll
===================================================================
--- test/CodeGen/X86/known-bits.ll
+++ test/CodeGen/X86/known-bits.ll
@@ -152,7 +152,9 @@
 ; X64-NEXT:    andq $-1024, %rsi # imm = 0xFC00
 ; X64-NEXT:    addq %rdi, %rsi
 ; X64-NEXT:    adcl $0, %edx
-; X64-NEXT:    shldq $54, %rsi, %rdx
+; X64-NEXT:    shlq $54, %rdx
+; X64-NEXT:    shrq $10, %rsi
+; X64-NEXT:    leaq (%rsi,%rdx), %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    retq
   %1 = and i64 %a0, -1024
Index: test/CodeGen/X86/shift-double-x86_64.ll
===================================================================
--- test/CodeGen/X86/shift-double-x86_64.ll
+++ test/CodeGen/X86/shift-double-x86_64.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64 -mcpu=btver2 | FileCheck %s

 ; SHLD/SHRD manual shifts
Index: test/CodeGen/X86/x86-64-double-precision-shift-right.ll
===================================================================
--- test/CodeGen/X86/x86-64-double-precision-shift-right.ll
+++ test/CodeGen/X86/x86-64-double-precision-shift-right.ll
@@ -1,6 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 | FileCheck %s
 ; Verify that for the architectures that are known to have poor latency
-; double precision shift instructions we generate alternative sequence 
+; double precision shift instructions we generate alternative sequence
 ; of instructions with lower latencies instead of shrd instruction.

 ;uint64_t rshift1(uint64_t a, uint64_t b)
@@ -8,12 +9,13 @@
 ;{
 ;     return (a >> 1) | (b << 63);
 ;}

-; CHECK: rshift1:
-; CHECK: shrq {{.*}}
-; CHECK-NEXT: shlq $63, {{.*}}
-; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
-
 define i64 @rshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+; CHECK-LABEL: rshift1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shlq $63, %rsi
+; CHECK-NEXT:    shrq $1, %rdi
+; CHECK-NEXT:    leaq (%rdi,%rsi), %rax
+; CHECK-NEXT:    retq
   %1 = lshr i64 %a, 1
   %2 = shl i64 %b, 63
   %3 = or i64 %2, %1
@@ -25,13 +27,13 @@
 ;     return (a >> 2) | (b << 62);
 ;}

-; CHECK: rshift2:
-; CHECK: shrq $2, {{.*}}
-; CHECK-NEXT: shlq $62, {{.*}}
-; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
-
-
 define i64 @rshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+; CHECK-LABEL: rshift2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shlq $62, %rsi
+; CHECK-NEXT:    shrq $2, %rdi
+; CHECK-NEXT:    leaq (%rdi,%rsi), %rax
+; CHECK-NEXT:    retq
   %1 = lshr i64 %a, 2
   %2 = shl i64 %b, 62
   %3 = or i64 %2, %1
@@ -43,13 +45,13 @@
 ;     return (a >> 7) | (b << 57);
 ;}

-; CHECK: rshift7:
-; CHECK: shrq $7, {{.*}}
-; CHECK-NEXT: shlq $57, {{.*}}
-; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
-
-
 define i64 @rshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+; CHECK-LABEL: rshift7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shlq $57, %rsi
+; CHECK-NEXT:    shrq $7, %rdi
+; CHECK-NEXT:    leaq (%rdi,%rsi), %rax
+; CHECK-NEXT:    retq
   %1 = lshr i64 %a, 7
   %2 = shl i64 %b, 57
   %3 = or i64 %2, %1
@@ -61,11 +63,12 @@
 ;     return (a >> 63) | (b << 1);
 ;}

-; CHECK-LABEL: rshift63:
-; CHECK: shrq $63, %rdi
-; CHECK-NEXT: leaq (%rdi,%rsi,2), %rax
-
 define i64 @rshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+; CHECK-LABEL: rshift63:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shrq $63, %rdi
+; CHECK-NEXT:    leaq (%rdi,%rsi,2), %rax
+; CHECK-NEXT:    retq
   %1 = lshr i64 %a, 63
   %2 = shl i64 %b, 1
   %3 = or i64 %2, %1
Index: test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
===================================================================
--- test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
+++ test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -1,8 +1,9 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s

 ; clang -Oz -c test1.cpp -emit-llvm -S -o
 ; Verify that we generate shld insruction when we are optimizing for size,
-; even for X86_64 processors that are known to have poor latency double 
+; even for X86_64 processors that are known to have poor latency double
 ; precision shift instructions.
 ; uint64_t lshift10(uint64_t a, uint64_t b)
 ; {
 ;     return (a << 10) | (b >> 54);
 ; }

 ; Function Attrs: minsize nounwind readnone uwtable
 define i64 @_Z8lshift10mm(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: _Z8lshift10mm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    shldq $10, %rsi, %rdi
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    retq
 entry:
-; CHECK: shldq $10
   %shl = shl i64 %a, 10
   %shr = lshr i64 %b, 54
   %or = or i64 %shr, %shl
@@ -33,8 +38,12 @@
 ; Function Attrs: nounwind optsize readnone uwtable
 define i64 @_Z8lshift11mm(i64 %a, i64 %b) #1 {
+; CHECK-LABEL: _Z8lshift11mm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    shldq $11, %rsi, %rdi
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    retq
 entry:
-; CHECK: shldq $11
   %shl = shl i64 %a, 11
   %shr = lshr i64 %b, 53
   %or = or i64 %shr, %shl
@@ -54,9 +63,13 @@
 ; Function Attrs: nounwind optsize readnone uwtable
 define i64 @_Z8lshift12mm(i64 %a, i64 %b) #2 {
+; CHECK-LABEL: _Z8lshift12mm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    shlq $12, %rdi
+; CHECK-NEXT:    shrq $52, %rsi
+; CHECK-NEXT:    leaq (%rsi,%rdi), %rax
+; CHECK-NEXT:    retq
 entry:
-; CHECK: shlq $12
-; CHECK-NEXT: shrq $52
   %shl = shl i64 %a, 12
   %shr = lshr i64 %b, 52
   %or = or i64 %shr, %shl
Index: test/CodeGen/X86/x86-64-double-shifts-var.ll
===================================================================
--- test/CodeGen/X86/x86-64-double-shifts-var.ll
+++ test/CodeGen/X86/x86-64-double-shifts-var.ll
@@ -18,6 +18,7 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s
+; XFAIL: *

 ; Verify that for the X86_64 processors that are known to have poor latency
 ; double precision shift instructions we do not generate 'shld' or 'shrd'
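For completeness, a sketch of the kind of variable-count double shift this last test exercises (my own illustration, not the test's exact IR). With this patch the 64-bit DAG combine keeps the SHLD node even on "slow SHLD" CPUs, so whether a shld/shrd survives now depends on the MachineCombiner's cost model, which is presumably why the test is marked XFAIL for now:

#include <cstdint>

// Variable-count double-precision left shift; assumes 0 < c < 64.
uint64_t lshift(uint64_t a, uint64_t b, unsigned c) {
  return (a << c) | (b >> (64 - c));
}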