Index: llvm/trunk/lib/Target/X86/X86.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86.td
+++ llvm/trunk/lib/Target/X86/X86.td
@@ -239,6 +239,11 @@
 def FeatureSoftFloat
     : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                        "Use software floating point features.">;
+// On at least some AMD processors, there is no performance hazard to writing
+// only the lower parts of a YMM register without clearing the upper part.
+def FeatureFastPartialYMMWrite
+    : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
+                       "true", "Partial writes to YMM registers are fast">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -596,7 +601,8 @@
   FeatureXSAVE,
   FeatureXSAVEOPT,
   FeatureSlowSHLD,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureFastPartialYMMWrite
 ]>;
 
 // Bulldozer
Index: llvm/trunk/lib/Target/X86/X86Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h
@@ -189,6 +189,10 @@
   /// the stack pointer. This is an optimization for Intel Atom processors.
   bool UseLeaForSP;
 
+  /// True if there is no performance penalty to writing only the lower parts
+  /// of a YMM register without clearing the upper part.
+  bool HasFastPartialYMMWrite;
+
   /// True if 8-bit divisions are significantly faster than
   /// 32-bit divisions and should be used when possible.
   bool HasSlowDivide32;
@@ -421,6 +425,7 @@
   bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
+  bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }
Index: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp
@@ -285,6 +285,7 @@
   HasSSEUnalignedMem = false;
   HasCmpxchg16b = false;
   UseLeaForSP = false;
+  HasFastPartialYMMWrite = false;
   HasSlowDivide32 = false;
   HasSlowDivide64 = false;
   PadShortFunctions = false;
Index: llvm/trunk/lib/Target/X86/X86VZeroUpper.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86VZeroUpper.cpp
+++ llvm/trunk/lib/Target/X86/X86VZeroUpper.cpp
@@ -248,7 +248,7 @@
 /// vzeroupper instructions before function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
-  if (!ST.hasAVX() || ST.hasAVX512())
+  if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
     return false;
   TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
Index: llvm/trunk/test/CodeGen/X86/avx-vzeroupper.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-vzeroupper.ll
+++ llvm/trunk/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,4 +1,9 @@
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-write | FileCheck --check-prefix=FASTYMM %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s
+
+; FASTYMM-NOT: vzeroupper
+; BTVER2-NOT: vzeroupper
 
 declare i32 @foo()
 declare <4 x float> @do_sse(<4 x float>)
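For context, a minimal sketch of the behavior the new RUN lines check, using a hypothetical function that is not part of the patch: an incoming YMM argument leaves the upper-register state dirty, so the baseline +avx configuration inserts a vzeroupper before the call, while +fast-partial-ymm-write (or -mcpu=btver2, which now implies it) makes runOnMachineFunction bail out early so no vzeroupper is emitted.

; Illustrative sketch only, not part of the patch; assumes llc built with this change.
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-write | FileCheck %s
; CHECK-LABEL: sketch_ymm_live_across_call
; CHECK-NOT: vzeroupper
declare i32 @foo()

define <8 x float> @sketch_ymm_live_across_call(<8 x float> %a) nounwind {
  ; %a arrives in ymm0; without the feature, the call to @foo would normally be
  ; preceded by a vzeroupper because the upper YMM state may be dirty.
  %c = call i32 @foo()
  ret <8 x float> %a
}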