Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -182,6 +182,8 @@ "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; +def FeatureSAHF : SubtargetFeature<"sahf", "HasSAHF", "true", + "Support SAHF and LAHF instructions">; def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", "Support MPX instructions">; def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", @@ -273,7 +275,8 @@ FeatureSSSE3, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem + FeatureSlowBTMem, + FeatureSAHF ]>; def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureSlowUAMem16, @@ -281,7 +284,8 @@ FeatureSSE41, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem + FeatureSlowBTMem, + FeatureSAHF ]>; // Atom CPUs. @@ -299,7 +303,8 @@ FeatureSlowDivide64, FeatureCallRegIndirect, FeatureLEAUsesAG, - FeaturePadShortFunctions + FeaturePadShortFunctions, + FeatureSAHF ]>; def : BonnellProc<"bonnell">; def : BonnellProc<"atom">; // Pin the generic name to the baseline. @@ -319,7 +324,8 @@ FeaturePRFCHW, FeatureSlowLEA, FeatureSlowIncDec, - FeatureSlowBTMem + FeatureSlowBTMem, + FeatureSAHF ]>; def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. @@ -331,7 +337,8 @@ FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeaturePOPCNT + FeaturePOPCNT, + FeatureSAHF ]>; def : NehalemProc<"nehalem">; def : NehalemProc<"corei7">; @@ -346,7 +353,8 @@ FeatureSlowBTMem, FeaturePOPCNT, FeatureAES, - FeaturePCLMUL + FeaturePCLMUL, + FeatureSAHF ]>; def : WestmereProc<"westmere">; @@ -363,7 +371,8 @@ FeatureAES, FeaturePCLMUL, FeatureXSAVE, - FeatureXSAVEOPT + FeatureXSAVEOPT, + FeatureSAHF ]>; def : SandyBridgeProc<"sandybridge">; def : SandyBridgeProc<"corei7-avx">; // Legacy alias. 
@@ -382,7 +391,8 @@ FeatureXSAVEOPT, FeatureRDRAND, FeatureF16C, - FeatureFSGSBase + FeatureFSGSBase, + FeatureSAHF ]>; def : IvyBridgeProc<"ivybridge">; def : IvyBridgeProc<"core-avx-i">; // Legacy alias. @@ -408,7 +418,8 @@ FeatureFMA, FeatureRTM, FeatureHLE, - FeatureSlowIncDec + FeatureSlowIncDec, + FeatureSAHF ]>; def : HaswellProc<"haswell">; def : HaswellProc<"core-avx2">; // Legacy alias. @@ -436,7 +447,8 @@ FeatureHLE, FeatureADX, FeatureRDSEED, - FeatureSlowIncDec + FeatureSlowIncDec, + FeatureSAHF ]>; def : BroadwellProc<"broadwell">; @@ -465,7 +477,8 @@ FeatureRTM, FeatureHLE, FeatureSlowIncDec, - FeatureMPX + FeatureMPX, + FeatureSAHF ]>; def : KnightsLandingProc<"knl">; @@ -500,7 +513,8 @@ FeatureSlowIncDec, FeatureMPX, FeatureXSAVEC, - FeatureXSAVES + FeatureXSAVES, + FeatureSAHF ]>; def : SkylakeProc<"skylake">; def : SkylakeProc<"skx">; // Legacy alias. @@ -560,7 +574,8 @@ FeatureLZCNT, FeaturePOPCNT, FeatureXSAVE, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureSAHF ]>; // Jaguar @@ -580,7 +595,8 @@ FeaturePOPCNT, FeatureXSAVE, FeatureXSAVEOPT, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureSAHF ]>; // Bulldozer @@ -598,7 +614,8 @@ FeatureLZCNT, FeaturePOPCNT, FeatureXSAVE, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureSAHF ]>; // Piledriver def : Proc<"bdver2", [ @@ -619,7 +636,8 @@ FeatureBMI, FeatureTBM, FeatureFMA, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureSAHF ]>; // Steamroller @@ -643,7 +661,8 @@ FeatureFMA, FeatureXSAVEOPT, FeatureSlowSHLD, - FeatureFSGSBase + FeatureFSGSBase, + FeatureSAHF ]>; // Excavator @@ -666,7 +685,8 @@ FeatureTBM, FeatureFMA, FeatureXSAVEOPT, - FeatureFSGSBase + FeatureFSGSBase, + FeatureSAHF ]>; def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -13930,6 +13930,9 @@ SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, 
FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -4380,59 +4380,91 @@ return; } - bool FromEFLAGS = SrcReg == X86::EFLAGS; - bool ToEFLAGS = DestReg == X86::EFLAGS; - int Reg = FromEFLAGS ? DestReg : SrcReg; - bool is32 = X86::GR32RegClass.contains(Reg); - bool is64 = X86::GR64RegClass.contains(Reg); - if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { - // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is - // inefficient. Instead: - // - Save the overflow flag OF into AL using SETO, and restore it using a - // signed 8-bit addition of AL and INT8_MAX. - // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH - // using LAHF/SAHF. - // - When RAX/EAX is live and isn't the destination register, make sure it - // isn't clobbered by PUSH/POP'ing it before and after saving/restoring - // the flags. - // This approach is ~2.25x faster than using PUSHF/POPF. - // - // This is still somewhat inefficient because we don't know which flags are - // actually live inside EFLAGS. Were we able to do a single SETcc instead of - // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. - // - // PUSHF/POPF is also potentially incorrect because it affects other flags - // such as TF/IF/DF, which LLVM doesn't model. - // - // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - clobbersTheStack. - - int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; - int Push = is64 ? X86::PUSH64r : X86::PUSH32r; - int Pop = is64 ? 
X86::POP64r : X86::POP32r; - int AX = is64 ? X86::RAX : X86::EAX; - - bool AXDead = (Reg == AX) || - (MachineBasicBlock::LQR_Dead == - MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); - - if (!AXDead) - BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); - if (FromEFLAGS) { - BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); - BuildMI(MBB, MI, DL, get(X86::LAHF)); - BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); - } - if (ToEFLAGS) { - BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) - .addReg(X86::AL) - .addImm(INT8_MAX); - BuildMI(MBB, MI, DL, get(X86::SAHF)); + if (Subtarget.hasSAHF()) { + bool FromEFLAGS = SrcReg == X86::EFLAGS; + bool ToEFLAGS = DestReg == X86::EFLAGS; + int Reg = FromEFLAGS ? DestReg : SrcReg; + bool is32 = X86::GR32RegClass.contains(Reg); + bool is64 = X86::GR64RegClass.contains(Reg); + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is + // inefficient. Instead: + // - Save the overflow flag OF into AL using SETO, and restore it using + // a signed 8-bit addition of AL and INT8_MAX. + // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from + // AH using LAHF/SAHF. + // - When RAX/EAX is live and isn't the destination register, make sure + // it isn't clobbered by PUSH/POP'ing it before and after + // saving/restoring the flags. + // This approach is ~2.25x faster than using PUSHF/POPF. + // + // This is still somewhat inefficient because we don't know which flags + // are actually live inside EFLAGS. Were we able to do a single SETcc + // instead of SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. + // + // PUSHF/POPF is also potentially incorrect because it affects other flags + // such as TF/IF/DF, which LLVM doesn't model. + // + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. 
See X86FrameLowering.cpp - clobbersTheStack. + + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? X86::PUSH64r : X86::PUSH32r; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int AX = is64 ? X86::RAX : X86::EAX; + + bool AXDead = (Reg == AX) || + (MachineBasicBlock::LQR_Dead == + MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + + if (!AXDead) + BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); + BuildMI(MBB, MI, DL, get(X86::LAHF)); + BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) + .addReg(X86::AL) + .addImm(INT8_MAX); + BuildMI(MBB, MI, DL, get(X86::SAHF)); + } + if (!AXDead) + BuildMI(MBB, MI, DL, get(Pop), AX); + return; } - if (!AXDead) - BuildMI(MBB, MI, DL, get(Pop), AX); - return; + } else { + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - clobbersTheStack. 
+ if (SrcReg == X86::EFLAGS) { + if (X86::GR64RegClass.contains(DestReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSHF64)); + BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg); + return; + } + if (X86::GR32RegClass.contains(DestReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSHF32)); + BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg); + return; + } + } + if (DestReg == X86::EFLAGS) { + if (X86::GR64RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSH64r)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::POPF64)); + return; + } + if (X86::GR32RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSH32r)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::POPF32)); + return; + } + } } DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) Index: lib/Target/X86/X86InstrInfo.td =================================================================== --- lib/Target/X86/X86InstrInfo.td +++ lib/Target/X86/X86InstrInfo.td @@ -799,6 +799,7 @@ def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasSAHF : Predicate<"Subtarget->hasSAHF()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; @@ -1502,10 +1503,12 @@ let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", - [(set EFLAGS, (X86sahf AH))], IIC_AHF>; + [(set EFLAGS, (X86sahf AH))], IIC_AHF>, + Requires<[HasSAHF]>; let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], - IIC_AHF>; // AH = flags + IIC_AHF>, // AH = flags + Requires<[HasSAHF]>; } // SchedRW //===----------------------------------------------------------------------===// Index: lib/Target/X86/X86Subtarget.h =================================================================== 
--- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -152,6 +152,9 @@ /// Processor has RDSEED instructions. bool HasRDSEED; + /// Processor has SAHF/LAHF instructions. + bool HasSAHF; + /// True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; @@ -374,6 +377,7 @@ bool hasSHA() const { return HasSHA; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } + bool hasSAHF() const { return HasSAHF; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -189,6 +189,17 @@ FullFS = "+64bit,+sse2"; } +#if 1 + // FIXME: Hack: always add +sahf to the feature string outside 64-bit mode. + if (!In64BitMode) { + if (!FullFS.empty()) + FullFS = "+sahf," + FullFS; + else + FullFS = "+sahf"; + } +#endif + + // Parse features string and set the CPU. 
ParseSubtargetFeatures(CPUName, FullFS); @@ -264,6 +275,7 @@ HasSHA = false; HasPRFCHW = false; HasRDSEED = false; + HasSAHF = false; HasMPX = false; IsBTMemSlow = false; IsSHLDSlow = false; Index: test/CodeGen/X86/cmpxchg-clobber-flags.ll =================================================================== --- test/CodeGen/X86/cmpxchg-clobber-flags.ll +++ test/CodeGen/X86/cmpxchg-clobber-flags.ll @@ -1,7 +1,11 @@ ; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s -check-prefix=i386 ; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=i386f + ; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s -check-prefix=x8664 ; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664 +; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s -check-prefix=x8664-sahf +; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664-sahf +; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -mcpu=corei7 %s -o - | FileCheck %s -check-prefix=x8664-sahf declare i32 @foo() declare i32 @bar(i64) @@ -37,18 +41,28 @@ ; x8664-LABEL: test_intervening_call: ; x8664: cmpxchgq -; x8664: pushq %rax -; x8664-NEXT: seto %al -; x8664-NEXT: lahf -; x8664-NEXT: movq %rax, [[FLAGS:%.*]] -; x8664-NEXT: popq %rax +; x8664: pushfq +; x8664-NEXT: popq [[FLAGS:%.*]] ; x8664-NEXT: movq %rax, %rdi ; x8664-NEXT: callq bar -; x8664-NEXT: movq [[FLAGS]], %rax -; x8664-NEXT: addb $127, %al -; x8664-NEXT: sahf +; x8664-NEXT: pushq [[FLAGS]] +; x8664-NEXT: popfq ; x8664-NEXT: jne +; x8664-sahf-LABEL: test_intervening_call: +; x8664-sahf: cmpxchgq +; x8664-sahf: pushq %rax +; x8664-sahf-NEXT: seto %al +; x8664-sahf-NEXT: lahf +; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]] +; x8664-sahf-NEXT: popq %rax +; x8664-sahf-NEXT: movq 
%rax, %rdi +; x8664-sahf-NEXT: callq bar +; x8664-sahf-NEXT: movq [[FLAGS]], %rax +; x8664-sahf-NEXT: addb $127, %al +; x8664-sahf-NEXT: sahf +; x8664-sahf-NEXT: jne + %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst %v = extractvalue { i64, i1 } %cx, 0 %p = extractvalue { i64, i1 } %cx, 1 @@ -76,6 +90,10 @@ ; x8664: cmpxchg ; x8664-NEXT: jne +; x8664-sahf-LABEL: test_control_flow: +; x8664-sahf: cmpxchg +; x8664-sahf-NEXT: jne + entry: %cmp = icmp sgt i32 %i, %j br i1 %cmp, label %loop_start, label %cond.end @@ -134,16 +152,24 @@ ; i386f-NEXT: popl %eax ; x8664-LABEL: test_feed_cmov: -; x8664: cmpxchgl -; x8664: seto %al -; x8664-NEXT: lahf -; x8664-NEXT: movq %rax, [[FLAGS:%.*]] +; x8664: cmpxchg +; x8664: pushfq +; x8664-NEXT: popq [[FLAGS:%.*]] ; x8664-NEXT: callq foo -; x8664-NEXT: pushq %rax -; x8664-NEXT: movq [[FLAGS]], %rax -; x8664-NEXT: addb $127, %al -; x8664-NEXT: sahf -; x8664-NEXT: popq %rax +; x8664-NEXT: pushq [[FLAGS]] +; x8664-NEXT: popfq + +; x8664-sahf-LABEL: test_feed_cmov: +; x8664-sahf: cmpxchgl +; x8664-sahf: seto %al +; x8664-sahf-NEXT: lahf +; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]] +; x8664-sahf-NEXT: callq foo +; x8664-sahf-NEXT: pushq %rax +; x8664-sahf-NEXT: movq [[FLAGS]], %rax +; x8664-sahf-NEXT: addb $127, %al +; x8664-sahf-NEXT: sahf +; x8664-sahf-NEXT: popq %rax %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst %success = extractvalue { i32, i1 } %res, 1