Index: include/llvm/Analysis/InlineCost.h =================================================================== --- include/llvm/Analysis/InlineCost.h +++ include/llvm/Analysis/InlineCost.h @@ -31,6 +31,7 @@ const int IndirectCallThreshold = 100; const int CallPenalty = 25; const int LastCallToStaticBonus = -15000; + const int LastCallToLinkOnceODRBonus = -50; const int ColdccPenalty = 2000; const int NoreturnPenalty = 10000; /// Do not inline functions which allocate this many bytes on the stack Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -41,7 +41,21 @@ // Threshold to use when optsize is specified (and there is no // -inline-threshold). -const int OptSizeThreshold = 75; +cl::opt OptSizeThreshold("inlineoptsize-threshold", cl::Hidden, + cl::init(75), cl::desc("Threshold for inlining " + "functions with -Os")); + +cl::opt InlineSoftFloatFix("inline-soft-float-fix", cl::Hidden, + cl::init(false), + cl::desc("Avoid overweighting soft float " + "loads & stores for inlining")); + +cl::opt InlineLinkOnceODRFix("inline-link-once-odr-fix", cl::Hidden, + cl::init(false), + cl::desc("Give a small bonus to inlining " + "single call LinkOnceODR functions " + "in the hope that the definition " + "can be eliminated.")); // Threshold to use when -Oz is specified (and there is no -inline-threshold). const int OptMinSizeThreshold = 25; @@ -1053,6 +1067,13 @@ // eventually become a library call. Treat the cost as such. if (I->getType()->isFloatingPointTy()) { bool hasSoftFloatAttr = false; + // Do not count loads and stores, as they do not get + // transformed into calls. + bool isLoadStore = false; + if (isa(I)) + isLoadStore = true; + if (isa(I)) + isLoadStore = true; // If the function has the "use-soft-float" attribute, mark it as // expensive. @@ -1064,7 +1085,7 @@ } if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive || - hasSoftFloatAttr) + (hasSoftFloatAttr && (!InlineSoftFloatFix || !isLoadStore))) Cost += InlineConstants::CallPenalty; } @@ -1214,6 +1235,10 @@ if (OnlyOneCallAndLocalLinkage) Cost += InlineConstants::LastCallToStaticBonus; + if (InlineLinkOnceODRFix && F.hasLinkOnceODRLinkage() && F.hasOneUse() && + &F == CS.getCalledFunction()) + Cost += InlineConstants::LastCallToLinkOnceODRBonus; + // If the normal destination of the invoke or the parent block of the call // site is unreachable-terminated, there is little point in inlining this // unless there is literally zero cost. Index: lib/CodeGen/PrologEpilogInserter.cpp =================================================================== --- lib/CodeGen/PrologEpilogInserter.cpp +++ lib/CodeGen/PrologEpilogInserter.cpp @@ -49,6 +49,9 @@ #define DEBUG_TYPE "pei" +static cl::opt ApplyR260917("apply-r260917", cl::Hidden, cl::init(false), + cl::desc("Apply revision 260917")); + namespace { class PEI : public MachineFunctionPass { public: @@ -731,7 +734,8 @@ } // Give the targets a chance to order the objects the way they like it. if (Fn.getTarget().getOptLevel() != CodeGenOpt::None && - Fn.getTarget().Options.StackSymbolOrdering) + Fn.getTarget().Options.StackSymbolOrdering && + ApplyR260917) TFI.orderFrameObjects(Fn, ObjectsToAllocate); // Now walk the objects and actually assign base offsets to them. Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -35,6 +35,9 @@ #include using namespace llvm; +static cl::opt ApplyR242280("apply-r242280", cl::Hidden, cl::init(false), + cl::desc("Apply revision 242280")); + /// NOTE: The TargetMachine owns TLOF. TargetLowering::TargetLowering(const TargetMachine &tm) : TargetLoweringBase(tm) {} @@ -193,30 +196,64 @@ (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128; break; default: - // Invert CC for unordered comparisons - ShouldInvertCC = true; - switch (CCCode) { - case ISD::SETULT: - LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 : - (VT == MVT::f64) ? RTLIB::OGE_F64 : - (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128; - break; - case ISD::SETULE: - LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 : - (VT == MVT::f64) ? RTLIB::OGT_F64 : - (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128; - break; - case ISD::SETUGT: - LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 : - (VT == MVT::f64) ? RTLIB::OLE_F64 : - (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128; - break; - case ISD::SETUGE: - LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : - (VT == MVT::f64) ? RTLIB::OLT_F64 : - (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128; - break; - default: llvm_unreachable("Do not know how to soften this setcc!"); + if (ApplyR242280) { + // Invert CC for unordered comparisons + ShouldInvertCC = true; + switch (CCCode) { + case ISD::SETULT: + LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 : + (VT == MVT::f64) ? RTLIB::OGE_F64 : + (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128; + break; + case ISD::SETULE: + LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 : + (VT == MVT::f64) ? RTLIB::OGT_F64 : + (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128; + break; + case ISD::SETUGT: + LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 : + (VT == MVT::f64) ? RTLIB::OLE_F64 : + (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128; + break; + case ISD::SETUGE: + LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : + (VT == MVT::f64) ? RTLIB::OLT_F64 : + (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128; + break; + default: llvm_unreachable("Do not know how to soften this setcc!"); + } + } + else { + LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : + (VT == MVT::f64) ? RTLIB::UO_F64 : RTLIB::UO_F128; + switch (CCCode) { + case ISD::SETONE: + // SETONE = SETOLT | SETOGT + LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : + (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; + // Fallthrough + case ISD::SETUGT: + LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 : + (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128; + break; + case ISD::SETUGE: + LC2 = (VT == MVT::f32) ? RTLIB::OGE_F32 : + (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128; + break; + case ISD::SETULT: + LC2 = (VT == MVT::f32) ? RTLIB::OLT_F32 : + (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; + break; + case ISD::SETULE: + LC2 = (VT == MVT::f32) ? RTLIB::OLE_F32 : + (VT == MVT::f64) ? RTLIB::OLE_F64 : RTLIB::OLE_F128; + break; + case ISD::SETUEQ: + LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : + (VT == MVT::f64) ? RTLIB::OEQ_F64 : RTLIB::OEQ_F128; + break; + default: llvm_unreachable("Do not know how to soften this setcc!"); + } } } Index: lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp =================================================================== --- lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -29,6 +29,9 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +static cl::opt ApplyR241152("apply-r241152", cl::Hidden, cl::init(false), + cl::desc("Apply revision 241152")); + static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { default: @@ -256,12 +259,30 @@ if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode()) return false; + if (ApplyR241152) { + // Check if the relaxable operand has an expression. For the current set of + // relaxable instructions, the relaxable operand is always the last operand. + unsigned RelaxableOp = Inst.getNumOperands() - 1; + if (Inst.getOperand(RelaxableOp).isExpr()) + return true; + } + else { + // Check if it has an expression and is not RIP relative. + bool hasExp = false; + bool hasRIP = false; + for (unsigned i = 0; i < Inst.getNumOperands(); ++i) { + const MCOperand &Op = Inst.getOperand(i); + if (Op.isExpr()) + hasExp = true; - // Check if the relaxable operand has an expression. For the current set of - // relaxable instructions, the relaxable operand is always the last operand. - unsigned RelaxableOp = Inst.getNumOperands() - 1; - if (Inst.getOperand(RelaxableOp).isExpr()) - return true; + if (Op.isReg() && Op.getReg() == X86::RIP) + hasRIP = true; + } + + // FIXME: Why exactly do we need the !hasRIP? Is it just a limitation on + // how we do relaxations? + return hasExp && !hasRIP; + } return false; } Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -31,6 +31,9 @@ // X86 Subtarget features //===----------------------------------------------------------------------===// +def FeatureX87 : SubtargetFeature<"x87","HasX87", "true", + "Enable X87 float instructions">; + def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", "Enable conditional move instructions">; @@ -259,37 +262,41 @@ class Proc Features> : ProcessorModel; -def : Proc<"generic", [FeatureSlowUAMem16]>; -def : Proc<"i386", [FeatureSlowUAMem16]>; -def : Proc<"i486", [FeatureSlowUAMem16]>; -def : Proc<"i586", [FeatureSlowUAMem16]>; -def : Proc<"pentium", [FeatureSlowUAMem16]>; -def : Proc<"pentium-mmx", [FeatureSlowUAMem16, FeatureMMX]>; -def : Proc<"i686", [FeatureSlowUAMem16]>; -def : Proc<"pentiumpro", [FeatureSlowUAMem16, FeatureCMOV]>; -def : Proc<"pentium2", [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV, - FeatureFXSR]>; -def : Proc<"pentium3", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, - FeatureFXSR]>; -def : Proc<"pentium3m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, - FeatureFXSR, FeatureSlowBTMem]>; -def : Proc<"pentium-m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureSlowBTMem]>; -def : Proc<"pentium4", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, - FeatureFXSR]>; -def : Proc<"pentium4m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>; +def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureCMOV, FeatureFXSR]>; +def : Proc<"pentium3", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureSSE1, FeatureFXSR]>; +def : Proc<"pentium3m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium-m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium4", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureSSE2, FeatureFXSR]>; +def : Proc<"pentium4m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>; + +// Intel Quark. +def : Proc<"lakemont", []>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, - [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, - FeatureSlowBTMem]>; + [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, + FeatureFXSR, FeatureSlowBTMem]>; // NetBurst. def : Proc<"prescott", - [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, - FeatureSlowBTMem]>; + [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, + FeatureFXSR, FeatureSlowBTMem]>; def : Proc<"nocona", [ + FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, @@ -300,6 +307,7 @@ // Intel Core 2 Solo/Duo. def : ProcessorModel<"core2", SandyBridgeModel, [ + FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSSE3, @@ -309,6 +317,7 @@ FeatureLAHFSAHF ]>; def : ProcessorModel<"penryn", SandyBridgeModel, [ + FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE41, @@ -321,6 +330,7 @@ // Atom CPUs. class BonnellProc : ProcessorModel : ProcessorModel : ProcessorModel : ProcessorModel; -def : Proc<"k6-2", [FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"k6-3", [FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"athlon", [FeatureSlowUAMem16, Feature3DNowA, +def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"athlon", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-tbird", [FeatureSlowUAMem16, Feature3DNowA, +def : Proc<"athlon-tbird", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-4", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, - FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-xp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, - FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-mp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, - FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, - FeatureFXSR, Feature64Bit, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"opteron", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, - FeatureFXSR, Feature64Bit, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"athlon64", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, - FeatureFXSR, Feature64Bit, FeatureSlowBTMem, +def : Proc<"athlon-4", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1, + Feature3DNowA, FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-fx", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, - FeatureFXSR, Feature64Bit, FeatureSlowBTMem, +def : Proc<"athlon-xp", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1, + Feature3DNowA, FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, - FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, +def : Proc<"athlon-mp", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1, + Feature3DNowA, FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, - FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, - FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, - FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; -def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, - FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; +def : Proc<"k8", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, + Feature3DNowA, FeatureFXSR, Feature64Bit, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"opteron", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, + Feature3DNowA, FeatureFXSR, Feature64Bit, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon64", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, + Feature3DNowA, FeatureFXSR, Feature64Bit, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-fx", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, + Feature3DNowA, FeatureFXSR, Feature64Bit, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"k8-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, + Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"opteron-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, + Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon64-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, + Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"amdfam10", [FeatureX87, FeatureSSE4A, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT, + FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD, + FeatureLAHFSAHF]>; +def : Proc<"barcelona", [FeatureX87, FeatureSSE4A, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT, + FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD, + FeatureLAHFSAHF]>; // Bobcat def : Proc<"btver1", [ + FeatureX87, FeatureMMX, FeatureSSSE3, FeatureSSE4A, @@ -581,6 +601,7 @@ // Jaguar def : ProcessorModel<"btver2", BtVer2Model, [ + FeatureX87, FeatureMMX, FeatureAVX, FeatureFXSR, @@ -603,6 +624,7 @@ // Bulldozer def : Proc<"bdver1", [ + FeatureX87, FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, @@ -621,6 +643,7 @@ ]>; // Piledriver def : Proc<"bdver2", [ + FeatureX87, FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, @@ -644,6 +667,7 @@ // Steamroller def : Proc<"bdver3", [ + FeatureX87, FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, @@ -669,6 +693,7 @@ // Excavator def : Proc<"bdver4", [ + FeatureX87, FeatureMMX, FeatureAVX2, FeatureFXSR, @@ -691,12 +716,13 @@ FeatureLAHFSAHF ]>; -def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; +def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>; -def : Proc<"winchip-c6", [FeatureSlowUAMem16, FeatureMMX]>; -def : Proc<"winchip2", [FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"c3", [FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>; +def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureSSE1, FeatureFXSR]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -709,8 +735,8 @@ // knobs which need to be tuned differently for AMD chips, we might consider // forming a common base for them. def : ProcessorModel<"x86-64", SandyBridgeModel, - [FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit, - FeatureSlowBTMem ]>; + [FeatureX87, FeatureMMX, FeatureSSE2, FeatureFXSR, + Feature64Bit, FeatureSlowBTMem ]>; //===----------------------------------------------------------------------===// // Register File Description Index: lib/Target/X86/X86CallFrameOptimization.cpp =================================================================== --- lib/Target/X86/X86CallFrameOptimization.cpp +++ lib/Target/X86/X86CallFrameOptimization.cpp @@ -41,7 +41,7 @@ static cl::opt NoX86CFOpt("no-x86-call-frame-opt", cl::desc("Avoid optimizing x86 call frames for size"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); namespace { class X86CallFrameOptimization : public MachineFunctionPass { Index: lib/Target/X86/X86FixupLEAs.cpp =================================================================== --- lib/Target/X86/X86FixupLEAs.cpp +++ lib/Target/X86/X86FixupLEAs.cpp @@ -29,6 +29,9 @@ #define DEBUG_TYPE "x86-fixup-LEAs" +static cl::opt ApplyR252722("apply-r252722", cl::Hidden, cl::init(false), + cl::desc("Apply revision 252722")); + STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { @@ -161,6 +164,8 @@ OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); OptLEA = ST.LEAusesAG() || ST.slowLEA(); + if (!ApplyR252722) OptIncDec = false; + if (!OptLEA && !OptIncDec) return false; Index: lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- lib/Target/X86/X86ISelDAGToDAG.cpp +++ lib/Target/X86/X86ISelDAGToDAG.cpp @@ -39,6 +39,9 @@ #define DEBUG_TYPE "x86-isel" +static cl::opt ApplyR244601("apply-r244601", cl::Hidden, cl::init(false), + cl::desc("Apply revision 244601")); + STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); //===----------------------------------------------------------------------===// @@ -295,6 +298,9 @@ if (!OptForSize) return false; + if (!ApplyR244601) + return false; + // Walk all the users of the immediate. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -69,9 +69,14 @@ "rather than promotion."), cl::Hidden); +static cl::opt Revert195496( + "revert-r195496", cl::init(false), + cl::desc("Revert r195496 to improve code size"), cl::Hidden); + X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { + bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); X86ScalarSSEf64 = Subtarget.hasSSE2(); X86ScalarSSEf32 = Subtarget.hasSSE1(); MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); @@ -557,7 +562,7 @@ // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps - } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32) { + } else if (UseX87 && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); @@ -592,7 +597,7 @@ setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } - } else if (!Subtarget.useSoftFloat()) { + } else if (UseX87) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); @@ -626,7 +631,7 @@ setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87, except f128 in MMX. - if (!Subtarget.useSoftFloat()) { + if (UseX87) { if (Subtarget.is64Bit() && Subtarget.hasMMX()) { addRegisterClass(MVT::f128, &X86::FR128RegClass); ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); @@ -2417,6 +2422,8 @@ bool RoundAfterCopy = false; if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { + if (!Subtarget.hasX87()) + report_fatal_error("X87 register return with X87 disabled"); CopyVT = MVT::f80; RoundAfterCopy = (CopyVT != VA.getLocVT()); } @@ -14605,7 +14612,8 @@ // This avoids subregister aliasing issues. Keep the smaller reference // if we're optimizing for size, however, as that'll allow better folding // of memory operations. - if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && + if (!Revert195496 && + Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && !DAG.getMachineFunction().getFunction()->optForMinSize() && !Subtarget.isAtom()) { unsigned ExtendOp = Index: lib/Target/X86/X86OptimizeLEAs.cpp =================================================================== --- lib/Target/X86/X86OptimizeLEAs.cpp +++ lib/Target/X86/X86OptimizeLEAs.cpp @@ -39,7 +39,7 @@ static cl::opt DisableX86LEAOpt("disable-x86-lea-opt", cl::Hidden, cl::desc("X86: Disable LEA optimizations."), - cl::init(false)); + cl::init(true)); STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed"); Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -70,6 +70,9 @@ /// MMX, 3DNow, 3DNow Athlon, or none supported. X863DNowEnum X863DNowLevel; + /// True if the processor supports X87 instructions. + bool HasX87; + /// True if this processor has conditional move instructions /// (generally pentium pro+). bool HasCMov; @@ -370,6 +373,7 @@ PICStyles::Style getPICStyle() const { return PICStyle; } void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } + bool hasX87() const { return HasX87; } bool hasCMov() const { return HasCMov; } bool hasSSE1() const { return X86SSELevel >= SSE1; } bool hasSSE2() const { return X86SSELevel >= SSE2; } Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -239,6 +239,7 @@ void X86Subtarget::initializeEnvironment() { X86SSELevel = NoSSE; X863DNowLevel = NoThreeDNow; + HasX87 = false; HasCMov = false; HasX86_64 = false; HasPOPCNT = false; Index: tools/clang/include/clang/Driver/Options.td =================================================================== --- tools/clang/include/clang/Driver/Options.td +++ tools/clang/include/clang/Driver/Options.td @@ -1264,6 +1264,8 @@ def m64 : Flag<["-"], "m64">, Group, Flags<[DriverOption, CoreOption]>; def mx32 : Flag<["-"], "mx32">, Group, Flags<[DriverOption, CoreOption]>; def mabi_EQ : Joined<["-"], "mabi=">, Group; +def miamcu : Flag<["-"], "miamcu">, Group, Flags<[DriverOption, CoreOption]>, + HelpText<"Use Intel MCU ABI.">; def malign_functions_EQ : Joined<["-"], "malign-functions=">, Group; def malign_loops_EQ : Joined<["-"], "malign-loops=">, Group; def malign_jumps_EQ : Joined<["-"], "malign-jumps=">, Group; Index: tools/clang/lib/Basic/Targets.cpp =================================================================== --- tools/clang/lib/Basic/Targets.cpp +++ tools/clang/lib/Basic/Targets.cpp @@ -2251,6 +2251,10 @@ /// Knights Landing processor. CK_KNL, + /// \name Lakemont + /// Lakemont microarchitecture based processors. + CK_Lakemont, + /// \name K6 /// K6 architecture processors. //@{ @@ -2355,6 +2359,7 @@ .Case("skx", CK_SkylakeServer) // Legacy name. .Case("cannonlake", CK_Cannonlake) .Case("knl", CK_KNL) + .Case("lakemont", CK_Lakemont) .Case("k6", CK_K6) .Case("k6-2", CK_K6_2) .Case("k6-3", CK_K6_3) @@ -2502,6 +2507,7 @@ case CK_C3_2: case CK_Pentium4: case CK_Pentium4M: + case CK_Lakemont: case CK_Prescott: case CK_K6: case CK_K6_2: @@ -2594,7 +2600,13 @@ if (getTriple().getArch() == llvm::Triple::x86_64) setFeatureEnabledImpl(Features, "sse2", true); - switch (getCPUKind(CPU)) { + CPUKind Kind = getCPUKind(CPU); + + // Enable X87 for all X86 processors but Lakemont. + if (Kind != CK_Lakemont) + setFeatureEnabledImpl(Features, "x87", true); + + switch (Kind) { case CK_Generic: case CK_i386: case CK_i486: @@ -2602,6 +2614,7 @@ case CK_Pentium: case CK_i686: case CK_PentiumPro: + case CK_Lakemont: break; case CK_PentiumMMX: case CK_Pentium2: @@ -3259,6 +3272,10 @@ case CK_KNL: defineCPUMacros(Builder, "knl"); break; + case CK_Lakemont: + defineCPUMacros(Builder, "iamcu", false); + Builder.defineMacro("__tune_lakemont__"); + break; case CK_K6_2: Builder.defineMacro("__k6_2__"); Builder.defineMacro("__tune_k6_2__"); Index: tools/clang/lib/Driver/Tools.h =================================================================== --- tools/clang/lib/Driver/Tools.h +++ tools/clang/lib/Driver/Tools.h @@ -124,6 +124,8 @@ : Tool("clang::as", "clang integrated assembler", TC, RF_Full) {} void AddMIPSTargetArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const; + void AddX86TargetArgs(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const; bool hasGoodDiagnostics() const override { return true; } bool hasIntegratedAssembler() const override { return false; } bool hasIntegratedCPP() const override { return false; } Index: tools/clang/lib/Driver/Tools.cpp =================================================================== --- tools/clang/lib/Driver/Tools.cpp +++ tools/clang/lib/Driver/Tools.cpp @@ -2053,6 +2053,18 @@ << A->getOption().getName() << Value; } } + + // If -miamcu is set then set flags to support MCU ABI and use MCU triple. + if (Args.getLastArg(options::OPT_miamcu)) { + if (getToolChain().getArch() != llvm::Triple::x86) + getToolChain().getDriver().Diag(diag::err_drv_unsupported_opt) + << "-miamcu"; + CmdArgs.push_back("-triple"); + CmdArgs.push_back("i586-intel-elfiamcu"); + CmdArgs.push_back("-mfloat-abi"); + CmdArgs.push_back("soft"); + CmdArgs.push_back("-mstack-alignment=4"); + } } void Clang::AddHexagonTargetArgs(const ArgList &Args, @@ -5982,6 +5994,18 @@ CmdArgs.push_back(ABIName.data()); } +void ClangAs::AddX86TargetArgs(const ArgList &Args, + ArgStringList &CmdArgs) const { + // If -miamcu is set then use MCU triple. + if (Args.getLastArg(options::OPT_miamcu)) { + if (getToolChain().getArch() != llvm::Triple::x86) + getToolChain().getDriver().Diag(diag::err_drv_unsupported_opt) + << "-miamcu"; + CmdArgs.push_back("-triple"); + CmdArgs.push_back("i586-intel-elfiamcu"); + } +} + void ClangAs::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, @@ -6123,6 +6147,11 @@ case llvm::Triple::mips64el: AddMIPSTargetArgs(Args, CmdArgs); break; + + case llvm::Triple::x86: + case llvm::Triple::x86_64: + AddX86TargetArgs(Args, CmdArgs); + break; } // Consume all the warning flags. Usually this would be handled more Index: tools/clang/test/CodeGen/attr-target-x86.c =================================================================== --- tools/clang/test/CodeGen/attr-target-x86.c +++ tools/clang/test/CodeGen/attr-target-x86.c @@ -18,6 +18,8 @@ int __attribute__((target("no-mmx"))) qq(int a) { return 40; } +int __attribute__((target("arch=lakemont"))) lake(int a) { return 4; } + // Check that we emit the additional subtarget and cpu features for foo and not for baz or bar. // CHECK: baz{{.*}} #0 // CHECK: foo{{.*}} #1 @@ -31,9 +33,11 @@ // CHECK: qux{{.*}} #1 // CHECK: qax{{.*}} #4 // CHECK: qq{{.*}} #5 +// CHECK: lake{{.*}} #6 // CHECK: #0 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+aes,+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt" // CHECK: #2 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,-aes,-avx,-avx2,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-f16c,-fma,-fma4,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-xop,-xsave,-xsaveopt" // CHECK: #3 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" // CHECK: #4 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-aes" // CHECK: #5 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+sse,+sse2,-3dnow,-3dnowa,-mmx" +// CHECK: #6 = {{.*}}"target-cpu"="lakemont" "target-features"="+mmx,+sse,+sse2" Index: tools/clang/test/Preprocessor/predefined-arch-macros.c =================================================================== --- tools/clang/test/Preprocessor/predefined-arch-macros.c +++ tools/clang/test/Preprocessor/predefined-arch-macros.c @@ -1000,6 +1000,20 @@ // CHECK_SLM_M64: #define __x86_64 1 // CHECK_SLM_M64: #define __x86_64__ 1 // +// RUN: %clang -march=lakemont -m32 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck %s -check-prefix=CHECK_LMT_M32 +// CHECK_LMT_M32: #define __i386 1 +// CHECK_LMT_M32: #define __i386__ 1 +// CHECK_LMT_M32: #define __iamcu 1 +// CHECK_LMT_M32: #define __iamcu__ 1 +// CHECK_LMT_M32: #define __tune_lakemont__ 1 +// CHECK_LMT_M32: #define i386 1 +// RUN: not %clang -march=lakemont -m64 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck %s -check-prefix=CHECK_LMT_M64 +// CHECK_LMT_M64: error: +// // RUN: %clang -march=geode -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck %s -check-prefix=CHECK_GEODE_M32