diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -130,6 +130,8 @@
 Intel CPUs. This tries to limit the use of 512-bit registers which can cause a
 decrease in CPU frequency on these CPUs. This can be re-enabled by passing
 -mprefer-vector-width=512 to clang or passing -mattr=-prefer-256-bit to llc.
+* The hidden command line flag -x86-use-vzeroupper has been replaced by the
+  subtarget feature omit-vzeroupper, which disables VZEROUPPER insertion.
 
 Changes to the AMDGPU Target
 -----------------------------
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -310,6 +310,11 @@
     : SubtargetFeature<"fast-partial-ymm-or-zmm-write", "HasFastPartialYMMorZMMWrite",
                        "true", "Partial writes to YMM/ZMM registers are fast">;
 
+// Disable automatic VZEROUPPER emission.
+def FeatureOmitVZEROUPPER
+    : SubtargetFeature<"omit-vzeroupper", "OmitVZEROUPPER", "true",
+                       "Omit VZEROUPPER instruction emission">;
+
 // FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
 // than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
 // vector FSQRT has higher throughput than the corresponding NR code.
@@ -808,6 +813,7 @@
                                       FeaturePreferMaskRegisters,
                                       FeatureSlowTwoMemOps,
                                       FeatureFastPartialYMMorZMMWrite,
+                                      FeatureOmitVZEROUPPER,
                                       FeatureHasFastGather,
                                       FeatureSlowPMADDWD];
   // TODO Add AVX5124FMAPS/AVX5124VNNIW features
@@ -864,6 +870,7 @@
   list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT,
                                                    FeatureFastBEXTR,
                                                    FeatureFastPartialYMMorZMMWrite,
+                                                   FeatureOmitVZEROUPPER,
                                                    FeatureFastHorizontalOps];
   list<SubtargetFeature> BtVer2InheritableFeatures =
       !listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -260,6 +260,9 @@
   /// of a YMM or ZMM register without clearing the upper part.
   bool HasFastPartialYMMorZMMWrite = false;
 
+  /// True if the VZEROUPPER instruction should be omitted.
+  bool OmitVZEROUPPER = false;
+
   /// True if there is no performance penalty for writing NOPs with up to
   /// 11 bytes.
   bool HasFast11ByteNOP = false;
@@ -661,6 +664,7 @@
   bool hasFastPartialYMMorZMMWrite() const {
     return HasFastPartialYMMorZMMWrite;
   }
+  bool omitVZEROUPPER() const { return OmitVZEROUPPER; }
   bool hasFastGather() const { return HasFastGather; }
   bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -315,14 +315,6 @@
   return I.get();
 }
 
-//===----------------------------------------------------------------------===//
-// Command line options for x86
-//===----------------------------------------------------------------------===//
-static cl::opt<bool>
-UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
-              cl::desc("Minimize AVX to SSE transition penalty"),
-              cl::init(true));
-
 //===----------------------------------------------------------------------===//
 // X86 TTI query.
 //===----------------------------------------------------------------------===//
@@ -499,8 +491,7 @@
 
   addPass(createX86IndirectBranchTrackingPass());
 
-  if (UseVZeroUpper)
-    addPass(createX86IssueVZeroUpperPass());
+  addPass(createX86IssueVZeroUpperPass());
 
   if (getOptLevel() != CodeGenOpt::None) {
     addPass(createX86FixupBWInsts());
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -52,6 +52,7 @@
       X86::FeatureFastHorizontalOps,
       X86::FeatureFastLZCNT,
      X86::FeatureFastPartialYMMorZMMWrite,
+      X86::FeatureOmitVZEROUPPER,
       X86::FeatureFastScalarFSQRT,
       X86::FeatureFastSHLDRotate,
       X86::FeatureFastScalarShiftMasks,
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -279,7 +279,7 @@
 /// function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
-  if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
+  if (!ST.hasAVX() || ST.omitVZEROUPPER())
     return false;
   TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/llvm/test/CodeGen/X86/avx-vzeroupper.ll b/llvm/test/CodeGen/X86/avx-vzeroupper.ll
--- a/llvm/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/llvm/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,9 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BDVER2
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,VZ,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,VZ,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=ALL,NO-VZ,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefixes=ALL,NO-VZ,BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefixes=ALL,VZ,FAST-ymm-zmm
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+omit-vzeroupper | FileCheck %s --check-prefixes=ALL,NO-VZ,AVX-NO-VZ
 
 declare i32 @foo()
 declare <4 x float> @do_sse(<4 x float>)
@@ -44,19 +45,6 @@
 ; VZ-NEXT: addq $56, %rsp
 ; VZ-NEXT: retq
 ;
-; FAST-ymm-zmm-LABEL: test01:
-; FAST-ymm-zmm: # %bb.0:
-; FAST-ymm-zmm-NEXT: subq $56, %rsp
-; FAST-ymm-zmm-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
-; FAST-ymm-zmm-NEXT: vmovaps {{.*}}(%rip), %xmm0
-; FAST-ymm-zmm-NEXT: callq do_sse
-; FAST-ymm-zmm-NEXT: vmovaps %xmm0, {{.*}}(%rip)
-; FAST-ymm-zmm-NEXT: callq do_sse
-; FAST-ymm-zmm-NEXT: vmovaps %xmm0, {{.*}}(%rip)
-; FAST-ymm-zmm-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; FAST-ymm-zmm-NEXT: addq $56, %rsp
-; FAST-ymm-zmm-NEXT: retq
-;
 ; BDVER2-LABEL: test01:
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: subq $56, %rsp
@@ -83,6 +71,19 @@
 ; BTVER2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; BTVER2-NEXT: addq $56, %rsp
 ; BTVER2-NEXT: retq
+;
+; AVX-NO-VZ-LABEL: test01:
+; AVX-NO-VZ: # %bb.0:
+; AVX-NO-VZ-NEXT: subq $56, %rsp
+; AVX-NO-VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX-NO-VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0
+; AVX-NO-VZ-NEXT: callq do_sse
+; AVX-NO-VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip)
+; AVX-NO-VZ-NEXT: callq do_sse
+; AVX-NO-VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip)
+; AVX-NO-VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX-NO-VZ-NEXT: addq $56, %rsp
+; AVX-NO-VZ-NEXT: retq
   %tmp = load <4 x float>, <4 x float>* @x, align 16
   %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
   store <4 x float> %call, <4 x float>* @x, align 16
@@ -100,11 +101,6 @@
 ; VZ-NEXT: vzeroupper
 ; VZ-NEXT: jmp do_sse # TAILCALL
 ;
-; FAST-ymm-zmm-LABEL: test02:
-; FAST-ymm-zmm: # %bb.0:
-; FAST-ymm-zmm-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; FAST-ymm-zmm-NEXT: jmp do_sse # TAILCALL
-;
 ; BDVER2-LABEL: test02:
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0
@@ -115,6 +111,11 @@
 ; BTVER2: # %bb.0:
 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; BTVER2-NEXT: jmp do_sse # TAILCALL
+;
+; AVX-NO-VZ-LABEL: test02:
+; AVX-NO-VZ: # %bb.0:
+; AVX-NO-VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NO-VZ-NEXT: jmp do_sse # TAILCALL
   %add.i = fadd <8 x float> %a, %b
   %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
   %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
@@ -154,35 +155,6 @@
 ; VZ-NEXT: popq %rbx
 ; VZ-NEXT: retq
 ;
-; FAST-ymm-zmm-LABEL: test03:
-; FAST-ymm-zmm: # %bb.0: # %entry
-; FAST-ymm-zmm-NEXT: pushq %rbx
-; FAST-ymm-zmm-NEXT: subq $16, %rsp
-; FAST-ymm-zmm-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; FAST-ymm-zmm-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; FAST-ymm-zmm-NEXT: .p2align 4, 0x90
-; FAST-ymm-zmm-NEXT: .LBB3_1: # %while.cond
-; FAST-ymm-zmm-NEXT: # =>This Inner Loop Header: Depth=1
-; FAST-ymm-zmm-NEXT: callq foo
-; FAST-ymm-zmm-NEXT: testl %eax, %eax
-; FAST-ymm-zmm-NEXT: jne .LBB3_1
-; FAST-ymm-zmm-NEXT: # %bb.2: # %for.body.preheader
-; FAST-ymm-zmm-NEXT: movl $4, %ebx
-; FAST-ymm-zmm-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; FAST-ymm-zmm-NEXT: .p2align 4, 0x90
-; FAST-ymm-zmm-NEXT: .LBB3_3: # %for.body
-; FAST-ymm-zmm-NEXT: # =>This Inner Loop Header: Depth=1
-; FAST-ymm-zmm-NEXT: callq do_sse
-; FAST-ymm-zmm-NEXT: callq do_sse
-; FAST-ymm-zmm-NEXT: vmovaps g+{{.*}}(%rip), %xmm0
-; FAST-ymm-zmm-NEXT: callq do_sse
-; FAST-ymm-zmm-NEXT: decl %ebx
-; FAST-ymm-zmm-NEXT: jne .LBB3_3
-; FAST-ymm-zmm-NEXT: # %bb.4: # %for.end
-; FAST-ymm-zmm-NEXT: addq $16, %rsp
-; FAST-ymm-zmm-NEXT: popq %rbx
-; FAST-ymm-zmm-NEXT: retq
-;
 ; BDVER2-LABEL: test03:
 ; BDVER2: # %bb.0: # %entry
 ; BDVER2-NEXT: pushq %rbx
@@ -240,6 +212,35 @@
 ; BTVER2-NEXT: addq $16, %rsp
 ; BTVER2-NEXT: popq %rbx
 ; BTVER2-NEXT: retq
+;
+; AVX-NO-VZ-LABEL: test03:
+; AVX-NO-VZ: # %bb.0: # %entry
+; AVX-NO-VZ-NEXT: pushq %rbx
+; AVX-NO-VZ-NEXT: subq $16, %rsp
+; AVX-NO-VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NO-VZ-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NO-VZ-NEXT: .p2align 4, 0x90
+; AVX-NO-VZ-NEXT: .LBB3_1: # %while.cond
+; AVX-NO-VZ-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX-NO-VZ-NEXT: callq foo
+; AVX-NO-VZ-NEXT: testl %eax, %eax
+; AVX-NO-VZ-NEXT: jne .LBB3_1
+; AVX-NO-VZ-NEXT: # %bb.2: # %for.body.preheader
+; AVX-NO-VZ-NEXT: movl $4, %ebx
+; AVX-NO-VZ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NO-VZ-NEXT: .p2align 4, 0x90
+; AVX-NO-VZ-NEXT: .LBB3_3: # %for.body
+; AVX-NO-VZ-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX-NO-VZ-NEXT: callq do_sse
+; AVX-NO-VZ-NEXT: callq do_sse
+; AVX-NO-VZ-NEXT: vmovaps g+{{.*}}(%rip), %xmm0
+; AVX-NO-VZ-NEXT: callq do_sse
+; AVX-NO-VZ-NEXT: decl %ebx
+; AVX-NO-VZ-NEXT: jne .LBB3_3
+; AVX-NO-VZ-NEXT: # %bb.4: # %for.end
+; AVX-NO-VZ-NEXT: addq $16, %rsp
+; AVX-NO-VZ-NEXT: popq %rbx
+; AVX-NO-VZ-NEXT: retq
 entry:
   %add.i = fadd <4 x float> %a, %b
   br label %while.cond
@@ -279,16 +280,6 @@
 ; VZ-NEXT: vzeroupper
 ; VZ-NEXT: retq
 ;
-; FAST-ymm-zmm-LABEL: test04:
-; FAST-ymm-zmm: # %bb.0:
-; FAST-ymm-zmm-NEXT: pushq %rax
-; FAST-ymm-zmm-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; FAST-ymm-zmm-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; FAST-ymm-zmm-NEXT: callq do_avx
-; FAST-ymm-zmm-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; FAST-ymm-zmm-NEXT: popq %rax
-; FAST-ymm-zmm-NEXT: retq
-;
 ; BDVER2-LABEL: test04:
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: pushq %rax
@@ -309,6 +300,16 @@
 ; BTVER2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; BTVER2-NEXT: popq %rax
 ; BTVER2-NEXT: retq
+;
+; AVX-NO-VZ-LABEL: test04:
+; AVX-NO-VZ: # %bb.0:
+; AVX-NO-VZ-NEXT: pushq %rax
+; AVX-NO-VZ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NO-VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NO-VZ-NEXT: callq do_avx
+; AVX-NO-VZ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NO-VZ-NEXT: popq %rax
+; AVX-NO-VZ-NEXT: retq
   %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
   %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32>
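
Usage sketch (editorial note, not part of the patch): the new RUN lines above exercise the feature through llc's -mattr option; because omit-vzeroupper is a subtarget feature, it can also be requested per function through LLVM's standard "target-features" attribute. The IR below is a minimal, hypothetical example: the file name sample.ll, the function name @sample, and attribute group #0 are illustrative only, and the expected behaviour is inferred from how VZeroUpperInserter::runOnMachineFunction is gated in this patch.

; Assumed invocation: llc -mtriple=x86_64-unknown-unknown sample.ll
define <4 x float> @sample(<8 x float> %a, <8 x float> %b) #0 {
  %sum = fadd <8 x float> %a, %b   ; 256-bit AVX op dirties the upper YMM state
  %lo = shufflevector <8 x float> %sum, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %lo              ; only xmm0 is live at the return
}

; With "+avx" alone the VZeroUpper pass would normally insert a vzeroupper
; before this return; with "+omit-vzeroupper" (equivalently
; -mattr=+avx,+omit-vzeroupper to llc, as in the new RUN line above)
; runOnMachineFunction bails out early and no vzeroupper is emitted.
attributes #0 = { "target-features"="+avx,+omit-vzeroupper" }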