Index: llvm/trunk/lib/Target/X86/X86.td =================================================================== --- llvm/trunk/lib/Target/X86/X86.td +++ llvm/trunk/lib/Target/X86/X86.td @@ -79,16 +79,12 @@ "Bit testing of memory is slow">; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; -// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that -// explicit. Also, it seems this would be the default state for most chips -// going forward, so it would probably be better to negate the logic and -// match the 32-byte "slow mem" feature below. -def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", - "IsUAMemFast", "true", - "Fast unaligned memory access">; +def FeatureSlowUAMem : SubtargetFeature<"slow-unaligned-mem-under-32", + "IsUAMemUnder32Slow", "true", + "Slow unaligned 16-byte-or-less memory access">; def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", - "IsUAMem32Slow", "true", - "Slow unaligned 32-byte memory access">; + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions", [FeatureSSE3]>; @@ -213,38 +209,42 @@ class Proc Features> : ProcessorModel; -def : Proc<"generic", []>; -def : Proc<"i386", []>; -def : Proc<"i486", []>; -def : Proc<"i586", []>; -def : Proc<"pentium", []>; -def : Proc<"pentium-mmx", [FeatureMMX]>; -def : Proc<"i686", []>; -def : Proc<"pentiumpro", [FeatureCMOV]>; -def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; -def : Proc<"pentium3", [FeatureSSE1]>; -def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; -def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; -def : Proc<"pentium4", [FeatureSSE2]>; -def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; +def : Proc<"generic", [FeatureSlowUAMem]>; +def : Proc<"i386", [FeatureSlowUAMem]>; +def : Proc<"i486", [FeatureSlowUAMem]>; +def : Proc<"i586", [FeatureSlowUAMem]>; +def : Proc<"pentium", [FeatureSlowUAMem]>; +def : Proc<"pentium-mmx", [FeatureSlowUAMem, FeatureMMX]>; +def : Proc<"i686", [FeatureSlowUAMem]>; +def : Proc<"pentiumpro", [FeatureSlowUAMem, FeatureCMOV]>; +def : Proc<"pentium2", [FeatureSlowUAMem, FeatureMMX, FeatureCMOV]>; +def : Proc<"pentium3", [FeatureSlowUAMem, FeatureSSE1]>; +def : Proc<"pentium3m", [FeatureSlowUAMem, FeatureSSE1, FeatureSlowBTMem]>; +def : Proc<"pentium-m", [FeatureSlowUAMem, FeatureSSE2, FeatureSlowBTMem]>; +def : Proc<"pentium4", [FeatureSlowUAMem, FeatureSSE2]>; +def : Proc<"pentium4m", [FeatureSlowUAMem, FeatureSSE2, FeatureSlowBTMem]>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, - [FeatureSSE3, FeatureSlowBTMem]>; + [FeatureSlowUAMem, FeatureSSE3, FeatureSlowBTMem]>; // NetBurst. -def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; -def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : Proc<"prescott", [FeatureSlowUAMem, FeatureSSE3, FeatureSlowBTMem]>; +def : Proc<"nocona", [FeatureSlowUAMem, FeatureSSE3, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; // Intel Core 2 Solo/Duo. def : ProcessorModel<"core2", SandyBridgeModel, - [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; + [FeatureSlowUAMem, FeatureSSSE3, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; def : ProcessorModel<"penryn", SandyBridgeModel, - [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; + [FeatureSlowUAMem, FeatureSSE41, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; // Atom CPUs. class BonnellProc : ProcessorModel; def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. @@ -283,7 +282,6 @@ FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeaturePOPCNT ]>; def : NehalemProc<"nehalem">; @@ -295,7 +293,6 @@ FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeaturePOPCNT, FeatureAES, FeaturePCLMUL @@ -308,7 +305,6 @@ FeatureAVX, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeatureSlowUAMem32, FeaturePOPCNT, FeatureAES, @@ -321,7 +317,6 @@ FeatureAVX, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeatureSlowUAMem32, FeaturePOPCNT, FeatureAES, @@ -337,7 +332,6 @@ FeatureAVX2, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, @@ -360,7 +354,6 @@ FeatureAVX2, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, @@ -383,7 +376,7 @@ // FIXME: define KNL model class KnightsLandingProc : ProcessorModel : ProcessorModel; -def : Proc<"k6-2", [Feature3DNow]>; -def : Proc<"k6-3", [Feature3DNow]>; -def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit, +def : Proc<"k6", [FeatureSlowUAMem, FeatureMMX]>; +def : Proc<"k6-2", [FeatureSlowUAMem, Feature3DNow]>; +def : Proc<"k6-3", [FeatureSlowUAMem, Feature3DNow]>; +def : Proc<"athlon", [FeatureSlowUAMem, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit, +def : Proc<"athlon-tbird", [FeatureSlowUAMem, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit, +def : Proc<"athlon-4", [FeatureSlowUAMem, FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit, +def : Proc<"athlon-xp", [FeatureSlowUAMem, FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, +def : Proc<"athlon-mp", [FeatureSlowUAMem, FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"amdfam10", [FeatureSSE4A, +def : Proc<"k8", [FeatureSlowUAMem, FeatureSSE2, Feature3DNowA, + Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"opteron", [FeatureSlowUAMem, FeatureSSE2, Feature3DNowA, + Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"athlon64", [FeatureSlowUAMem, FeatureSSE2, Feature3DNowA, + Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"athlon-fx", [FeatureSlowUAMem, FeatureSSE2, Feature3DNowA, + Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"k8-sse3", [FeatureSlowUAMem, FeatureSSE3, Feature3DNowA, + FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"opteron-sse3", [FeatureSlowUAMem, FeatureSSE3, Feature3DNowA, + FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"athlon64-sse3", [FeatureSlowUAMem, FeatureSSE3, Feature3DNowA, + FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"amdfam10", [FeatureSlowUAMem, FeatureSSE4A, Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"barcelona", [FeatureSSE4A, +def : Proc<"barcelona", [FeatureSlowUAMem, FeatureSSE4A, Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD]>; + +// FIXME: We should remove 'FeatureSlowUAMem' from AMD chips under here. + // Bobcat def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowSHLD]>; + FeatureSlowSHLD, FeatureSlowUAMem]>; // Jaguar def : ProcessorModel<"btver2", BtVer2Model, [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, FeaturePRFCHW, FeatureAES, FeaturePCLMUL, FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem, + FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>; -// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips. - // Bulldozer def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, FeaturePCLMUL, FeatureAVX, FeatureSSE4A, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowSHLD]>; + FeaturePOPCNT, FeatureSlowSHLD, + FeatureSlowUAMem]>; // Piledriver def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, FeaturePCLMUL, FeatureAVX, FeatureSSE4A, FeatureF16C, FeatureLZCNT, FeaturePOPCNT, FeatureBMI, - FeatureTBM, FeatureFMA, FeatureSlowSHLD]>; + FeatureTBM, FeatureFMA, FeatureSlowSHLD, + FeatureSlowUAMem]>; // Steamroller def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, @@ -474,7 +477,7 @@ FeatureAVX, FeatureSSE4A, FeatureF16C, FeatureLZCNT, FeaturePOPCNT, FeatureBMI, FeatureTBM, FeatureFMA, FeatureSlowSHLD, - FeatureFSGSBase]>; + FeatureFSGSBase, FeatureSlowUAMem]>; // Excavator def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4, @@ -482,14 +485,14 @@ FeaturePCLMUL, FeatureF16C, FeatureLZCNT, FeaturePOPCNT, FeatureBMI, FeatureBMI2, FeatureTBM, FeatureFMA, FeatureSSE4A, - FeatureFSGSBase]>; + FeatureFSGSBase, FeatureSlowUAMem]>; -def : Proc<"geode", [Feature3DNowA]>; +def : Proc<"geode", [FeatureSlowUAMem, Feature3DNowA]>; -def : Proc<"winchip-c6", [FeatureMMX]>; -def : Proc<"winchip2", [Feature3DNow]>; -def : Proc<"c3", [Feature3DNow]>; -def : Proc<"c3-2", [FeatureSSE1]>; +def : Proc<"winchip-c6", [FeatureSlowUAMem, FeatureMMX]>; +def : Proc<"winchip2", [FeatureSlowUAMem, Feature3DNow]>; +def : Proc<"c3", [FeatureSlowUAMem, Feature3DNow]>; +def : Proc<"c3-2", [FeatureSlowUAMem, FeatureSSE1]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -502,8 +505,7 @@ // knobs which need to be tuned differently for AMD chips, we might consider // forming a common base for them. def : ProcessorModel<"x86-64", SandyBridgeModel, - [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, - FeatureFastUAMem]>; + [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>; //===----------------------------------------------------------------------===// // Register File Description Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -1876,10 +1876,11 @@ if ((!IsMemset || ZeroMemset) && !F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && - (Subtarget->isUnalignedMemAccessFast() || + (!Subtarget->isUnalignedMemUnder32Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { if (Size >= 32) { + // FIXME: Check if unaligned 32-byte accesses are slow. if (Subtarget->hasInt256()) return MVT::v8i32; if (Subtarget->hasFp256()) @@ -1897,6 +1898,9 @@ return MVT::f64; } } + // This is a compromise. If we reach here, unaligned accesses may be slow on + // this target. However, creating smaller, aligned accesses could be even + // slower and would certainly be a lot more code. if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; @@ -1916,12 +1920,10 @@ unsigned, bool *Fast) const { if (Fast) { - // FIXME: We should be checking 128-bit accesses separately from smaller - // accesses. if (VT.getSizeInBits() == 256) *Fast = !Subtarget->isUnalignedMem32Slow(); else - *Fast = Subtarget->isUnalignedMemAccessFast(); + *Fast = !Subtarget->isUnalignedMemUnder32Slow(); } return true; } Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp @@ -5508,9 +5508,10 @@ const MCInstrDesc &MCID = get(Opc); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + // TODO: Check if 32-byte or greater accesses are slow too? if (!MI->hasOneMemOperand() && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMemUnder32Slow()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for // performance. @@ -5658,9 +5659,11 @@ cast(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMemUnder32Slow()) // Do not introduce a slow unaligned load. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -5701,9 +5704,11 @@ cast(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMemUnder32Slow()) // Do not introduce a slow unaligned store. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; Index: llvm/trunk/lib/Target/X86/X86Subtarget.h =================================================================== --- llvm/trunk/lib/Target/X86/X86Subtarget.h +++ llvm/trunk/lib/Target/X86/X86Subtarget.h @@ -146,10 +146,10 @@ /// True if SHLD instructions are slow. bool IsSHLDSlow; - /// True if unaligned memory access is fast. - bool IsUAMemFast; + /// True if unaligned memory accesses of 16-bytes or smaller are slow. + bool IsUAMemUnder32Slow; - /// True if unaligned 32-byte memory accesses are slow. + /// True if unaligned memory accesses of 32-bytes are slow. bool IsUAMem32Slow; /// True if SSE operations can have unaligned memory operands. @@ -357,7 +357,7 @@ bool hasRDSEED() const { return HasRDSEED; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } - bool isUnalignedMemAccessFast() const { return IsUAMemFast; } + bool isUnalignedMemUnder32Slow() const { return IsUAMemUnder32Slow; } bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } Index: llvm/trunk/lib/Target/X86/X86Subtarget.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86Subtarget.cpp +++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp @@ -255,7 +255,7 @@ HasMPX = false; IsBTMemSlow = false; IsSHLDSlow = false; - IsUAMemFast = false; + IsUAMemUnder32Slow = false; IsUAMem32Slow = false; HasSSEUnalignedMem = false; HasCmpxchg16b = false;