diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -93,6 +93,10 @@
 Clang. Setting the version to zero causes Clang to leave ``__GNUC__`` and
 other GNU-namespaced macros, such as ``__GXX_WEAK__``, undefined.
 
+- vzeroupper insertion on X86 targets can now be disabled with -mno-vzeroupper.
+  -mvzeroupper can also be used to force vzeroupper insertion on CPUs where it
+  is normally skipped.
+
 Deprecated Compiler Flags
 -------------------------
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3126,6 +3126,8 @@
 def mno_shstk : Flag<["-"], "mno-shstk">, Group<m_x86_Features_Group>;
 def mretpoline_external_thunk : Flag<["-"], "mretpoline-external-thunk">, Group<m_x86_Features_Group>;
 def mno_retpoline_external_thunk : Flag<["-"], "mno-retpoline-external-thunk">, Group<m_x86_Features_Group>;
+def mvzeroupper : Flag<["-"], "mvzeroupper">, Group<m_x86_Features_Group>;
+def mno_vzeroupper : Flag<["-"], "mno-vzeroupper">, Group<m_x86_Features_Group>;
 
 // These are legacy user-facing driver-level option spellings. They are always
 // aliases for options that are spelled using the more common Unix / GNU flag
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -193,3 +193,8 @@
 // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-enqcmd %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-ENQCMD %s
 // ENQCMD: "-target-feature" "+enqcmd"
 // NO-ENQCMD: "-target-feature" "-enqcmd"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mvzeroupper %s -### -o %t.o 2>&1 | FileCheck --check-prefix=VZEROUPPER %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-vzeroupper %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-VZEROUPPER %s
+// VZEROUPPER: "-target-feature" "+vzeroupper"
+// NO-VZEROUPPER: "-target-feature" "-vzeroupper"
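(Illustration, not part of the patch: the file and function below are invented, and the feature-string mapping in the comments is the one exercised by the driver test above.)

    // example.c -- sketch of how the new driver flags surface in the cc1 invocation
    //
    //   clang -mavx -mvzeroupper    -### example.c   ->  "-target-feature" "+vzeroupper"
    //   clang -mavx -mno-vzeroupper -### example.c   ->  "-target-feature" "-vzeroupper"
    //
    // With -mno-vzeroupper the backend no longer emits vzeroupper before calls
    // and returns, even when the loop below is vectorized with ymm registers.
    void scale(float *x, int n) {
      for (int i = 0; i < n; ++i)
        x[i] *= 2.0f;
    }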
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -134,6 +134,13 @@
   Intel CPUs. This tries to limit the use of 512-bit registers which can cause a
   decrease in CPU frequency on these CPUs. This can be re-enabled by passing
   -mprefer-vector-width=512 to clang or passing -mattr=-prefer-256-bit to llc.
+* Deprecated the mpx feature flag for the Intel MPX instructions. There were no
+  intrinsics for this feature. This change only affects the results returned by
+  getHostCPUFeatures on CPUs that implement the MPX instructions.
+* The feature flag fast-partial-ymm-or-zmm-write, which previously disabled
+  vzeroupper insertion, has been removed. It has been replaced by a vzeroupper
+  feature flag with the opposite polarity: -vzeroupper now has the same effect
+  that +fast-partial-ymm-or-zmm-write used to have.
 
 Changes to the AMDGPU Target
 -----------------------------
@@ -143,10 +150,6 @@
 
 During this release ...
 
-* Deprecated the mpx feature flag for the Intel MPX instructions. There were no
-  intrinsics for this feature. This change only this effects the results
-  returned by getHostCPUFeatures on CPUs that implement the MPX instructions.
-
 Changes to the WebAssembly Target
 ---------------------------------
 
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -304,12 +304,12 @@
     : SubtargetFeature<"fast-variable-shuffle",
                        "HasFastVariableShuffle",
                        "true", "Shuffles with variable masks are fast">;
-// On some X86 processors, there is no performance hazard to writing only the
-// lower parts of a YMM or ZMM register without clearing the upper part.
-def FeatureFastPartialYMMorZMMWrite
-    : SubtargetFeature<"fast-partial-ymm-or-zmm-write",
-                       "HasFastPartialYMMorZMMWrite",
-                       "true", "Partial writes to YMM/ZMM registers are fast">;
+// On some X86 processors, a vzeroupper instruction should be inserted after
+// using ymm/zmm registers before executing code that may use SSE instructions.
+def FeatureInsertVZEROUPPER
+    : SubtargetFeature<"vzeroupper",
+                       "InsertVZEROUPPER",
+                       "true", "Should insert vzeroupper instructions">;
 // FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
 // than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
 // vector FSQRT has higher throughput than the corresponding NR code.
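(Background sketch, not part of the patch: the comment on FeatureInsertVZEROUPPER above refers to the AVX-to-SSE transition penalty, where running legacy SSE code while the upper halves of the ymm/zmm registers hold stale data can be costly on a number of CPUs. The helper and function names below are invented for illustration.)

    #include <immintrin.h>

    __m128 legacy_sse_helper(__m128 x);   // assume this is compiled to legacy SSE encodings

    float mixed(const float *p) {
      __m256 wide = _mm256_loadu_ps(p);            // dirties the upper bits of a ymm register
      __m128 low  = _mm256_castps256_ps128(wide);  // keep only the lower 128 bits
      // On subtargets with the new "vzeroupper" feature, the X86VZeroUpper pass
      // inserts a vzeroupper before this call so the SSE code in the callee does
      // not pay the transition penalty; with -vzeroupper nothing is inserted.
      __m128 r = legacy_sse_helper(low);
      return _mm_cvtss_f32(r);
    }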
def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B]>; -def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>; -def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>; + FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>; +def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16, + FeatureInsertVZEROUPPER]>; +def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16, + FeatureInsertVZEROUPPER]>; def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B]>; + FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>; def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B]>; + FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>; def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B, FeatureMMX]>; + FeatureCMPXCHG8B, FeatureMMX, + FeatureInsertVZEROUPPER]>; def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureCMOV, FeatureNOPL]>; + FeatureCMOV, FeatureNOPL, FeatureInsertVZEROUPPER]>; def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV, FeatureFXSR, - FeatureNOPL]>; + FeatureNOPL, FeatureInsertVZEROUPPER]>; foreach P = ["pentium3", "pentium3m"] in { def : Proc; + FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV, + FeatureInsertVZEROUPPER]>; } // Enable the PostRAScheduler for SSE2 and SSE3 class cpus. @@ -1013,29 +1022,29 @@ def : ProcessorModel<"pentium-m", GenericPostRAModel, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; foreach P = ["pentium4", "pentium4m"] in { def : ProcessorModel; + FeatureCMOV, FeatureInsertVZEROUPPER]>; } // Intel Quark. -def : Proc<"lakemont", []>; +def : Proc<"lakemont", [FeatureInsertVZEROUPPER]>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; // NetBurst. def : ProcessorModel<"prescott", GenericPostRAModel, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; def : ProcessorModel<"nocona", GenericPostRAModel, [ FeatureX87, FeatureSlowUAMem16, @@ -1046,7 +1055,8 @@ FeatureFXSR, FeatureNOPL, Feature64Bit, - FeatureCMPXCHG16B + FeatureCMPXCHG16B, + FeatureInsertVZEROUPPER ]>; // Intel Core 2 Solo/Duo. @@ -1062,7 +1072,8 @@ Feature64Bit, FeatureCMPXCHG16B, FeatureLAHFSAHF, - FeatureMacroFusion + FeatureMacroFusion, + FeatureInsertVZEROUPPER ]>; def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureX87, @@ -1076,7 +1087,8 @@ Feature64Bit, FeatureCMPXCHG16B, FeatureLAHFSAHF, - FeatureMacroFusion + FeatureMacroFusion, + FeatureInsertVZEROUPPER ]>; // Atom CPUs. @@ -1143,35 +1155,36 @@ // AMD CPUs. 
def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureMMX]>; + FeatureMMX, FeatureInsertVZEROUPPER]>; def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - Feature3DNow]>; + Feature3DNow, FeatureInsertVZEROUPPER]>; def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - Feature3DNow]>; + Feature3DNow, FeatureInsertVZEROUPPER]>; foreach P = ["athlon", "athlon-tbird"] in { def : Proc; + Feature3DNowA, FeatureNOPL, FeatureSlowSHLD, + FeatureInsertVZEROUPPER]>; } foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { def : Proc; + FeatureSlowSHLD, FeatureInsertVZEROUPPER]>; } foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { def : Proc; + FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>; } foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { def : Proc; + FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>; } foreach P = ["amdfam10", "barcelona"] in { @@ -1196,14 +1209,17 @@ def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>; def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - Feature3DNowA]>; - -def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; -def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; + Feature3DNowA, FeatureInsertVZEROUPPER]>; + +def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureInsertVZEROUPPER]>; +def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow, + FeatureInsertVZEROUPPER]>; +def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow, + FeatureInsertVZEROUPPER]>; def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE1, FeatureFXSR, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -1226,7 +1242,8 @@ Feature64Bit, FeatureSlow3OpsLEA, FeatureSlowIncDec, - FeatureMacroFusion + FeatureMacroFusion, + FeatureInsertVZEROUPPER ]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -256,9 +256,9 @@ /// mask over multiple fixed shuffles. bool HasFastVariableShuffle = false; - /// True if there is no performance penalty to writing only the lower parts - /// of a YMM or ZMM register without clearing the upper part. - bool HasFastPartialYMMorZMMWrite = false; + /// True if vzeroupper instructions should be inserted after code that uses + /// ymm or zmm registers. + bool InsertVZEROUPPER = false; /// True if there is no performance penalty for writing NOPs with up to /// 11 bytes. 
@@ -658,9 +658,7 @@
   bool hasFastVariableShuffle() const {
     return HasFastVariableShuffle;
   }
-  bool hasFastPartialYMMorZMMWrite() const {
-    return HasFastPartialYMMorZMMWrite;
-  }
+  bool insertVZEROUPPER() const { return InsertVZEROUPPER; }
   bool hasFastGather() const { return HasFastGather; }
   bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -51,7 +51,6 @@
       X86::FeatureFastBEXTR,
       X86::FeatureFastHorizontalOps,
       X86::FeatureFastLZCNT,
-      X86::FeatureFastPartialYMMorZMMWrite,
       X86::FeatureFastScalarFSQRT,
       X86::FeatureFastSHLDRotate,
       X86::FeatureFastScalarShiftMasks,
@@ -78,6 +77,7 @@
       X86::FeatureSlowTwoMemOps,
      X86::FeatureSlowUAMem16,
       X86::FeaturePreferMaskRegisters,
+      X86::FeatureInsertVZEROUPPER,
 
       // Perf-tuning flags.
       X86::FeatureHasFastGather,
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -279,7 +279,7 @@
 /// function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
-  if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
+  if (!ST.hasAVX() || !ST.insertVZEROUPPER())
     return false;
   TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
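(Migration sketch, not part of the patch: any downstream or out-of-tree pass that consulted the removed accessor needs the same polarity flip shown above for X86VZeroUpper.cpp. The function name below is invented, and the include paths assume the usual in-tree layout under llvm/lib/Target/X86.)

    #include "X86Subtarget.h"
    #include "llvm/CodeGen/MachineFunction.h"

    using namespace llvm;

    static bool shouldInsertVZeroUpper(const MachineFunction &MF) {
      const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
      // Old form: return ST.hasAVX() && !ST.hasFastPartialYMMorZMMWrite();
      // New form: the subtarget now states the positive property directly.
      return ST.hasAVX() && ST.insertVZEROUPPER();
    }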
diff --git a/llvm/test/CodeGen/X86/avx-vzeroupper.ll b/llvm/test/CodeGen/X86/avx-vzeroupper.ll
--- a/llvm/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/llvm/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,-vzeroupper | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=DISABLE-VZ
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BDVER2
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
 
@@ -44,18 +44,18 @@
 ; VZ-NEXT:    addq $56, %rsp
 ; VZ-NEXT:    retq
 ;
-; FAST-ymm-zmm-LABEL: test01:
-; FAST-ymm-zmm:       # %bb.0:
-; FAST-ymm-zmm-NEXT:    subq $56, %rsp
-; FAST-ymm-zmm-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
-; FAST-ymm-zmm-NEXT:    vmovaps {{.*}}(%rip), %xmm0
-; FAST-ymm-zmm-NEXT:    callq do_sse
-; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
-; FAST-ymm-zmm-NEXT:    callq do_sse
-; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
-; FAST-ymm-zmm-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
-; FAST-ymm-zmm-NEXT:    addq $56, %rsp
-; FAST-ymm-zmm-NEXT:    retq
+; DISABLE-VZ-LABEL: test01:
+; DISABLE-VZ:       # %bb.0:
+; DISABLE-VZ-NEXT:    subq $56, %rsp
+; DISABLE-VZ-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; DISABLE-VZ-NEXT:    vmovaps {{.*}}(%rip), %xmm0
+; DISABLE-VZ-NEXT:    callq do_sse
+; DISABLE-VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; DISABLE-VZ-NEXT:    callq do_sse
+; DISABLE-VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; DISABLE-VZ-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; DISABLE-VZ-NEXT:    addq $56, %rsp
+; DISABLE-VZ-NEXT:    retq
 ;
 ; BDVER2-LABEL: test01:
 ; BDVER2:       # %bb.0:
@@ -83,6 +83,7 @@
 ; BTVER2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; BTVER2-NEXT:    addq $56, %rsp
 ; BTVER2-NEXT:    retq
+; DISABLE-VZ # %bb.0:
   %tmp = load <4 x float>, <4 x float>* @x, align 16
   %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
   store <4 x float> %call, <4 x float>* @x, align 16
@@ -100,10 +101,10 @@
 ; VZ-NEXT:    vzeroupper
 ; VZ-NEXT:    jmp do_sse # TAILCALL
 ;
-; FAST-ymm-zmm-LABEL: test02:
-; FAST-ymm-zmm:       # %bb.0:
-; FAST-ymm-zmm-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; FAST-ymm-zmm-NEXT:    jmp do_sse # TAILCALL
+; DISABLE-VZ-LABEL: test02:
+; DISABLE-VZ:       # %bb.0:
+; DISABLE-VZ-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; DISABLE-VZ-NEXT:    jmp do_sse # TAILCALL
 ;
 ; BDVER2-LABEL: test02:
 ; BDVER2:       # %bb.0:
@@ -154,34 +155,34 @@
 ; VZ-NEXT:    popq %rbx
 ; VZ-NEXT:    retq
 ;
-; FAST-ymm-zmm-LABEL: test03:
-; FAST-ymm-zmm:       # %bb.0: # %entry
-; FAST-ymm-zmm-NEXT:    pushq %rbx
-; FAST-ymm-zmm-NEXT:    subq $16, %rsp
-; FAST-ymm-zmm-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; FAST-ymm-zmm-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; FAST-ymm-zmm-NEXT:    .p2align 4, 0x90
-; FAST-ymm-zmm-NEXT:  .LBB3_1: # %while.cond
-; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
-; FAST-ymm-zmm-NEXT:    callq foo
-; FAST-ymm-zmm-NEXT:    testl %eax, %eax
-; FAST-ymm-zmm-NEXT:    jne .LBB3_1
-; FAST-ymm-zmm-NEXT:  # %bb.2: # %for.body.preheader
-; FAST-ymm-zmm-NEXT:    movl $4, %ebx
-; FAST-ymm-zmm-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
-; FAST-ymm-zmm-NEXT:    .p2align 4, 0x90
-; FAST-ymm-zmm-NEXT:  .LBB3_3: # %for.body
-; FAST-ymm-zmm-NEXT:    # =>This Inner Loop Header: Depth=1
-; FAST-ymm-zmm-NEXT:    callq do_sse
-; FAST-ymm-zmm-NEXT:    callq do_sse
-; FAST-ymm-zmm-NEXT:    vmovaps g+{{.*}}(%rip), %xmm0
-; FAST-ymm-zmm-NEXT:    callq do_sse
-; FAST-ymm-zmm-NEXT:    decl %ebx
-; FAST-ymm-zmm-NEXT:    jne .LBB3_3
-; FAST-ymm-zmm-NEXT:  # %bb.4: # %for.end
-; FAST-ymm-zmm-NEXT:    addq $16, %rsp
-; FAST-ymm-zmm-NEXT:    popq %rbx
-; FAST-ymm-zmm-NEXT:    retq
+; DISABLE-VZ-LABEL: test03:
+; DISABLE-VZ:       # %bb.0: # %entry
+; DISABLE-VZ-NEXT:    pushq %rbx
+; DISABLE-VZ-NEXT:    subq $16, %rsp
+; DISABLE-VZ-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; DISABLE-VZ-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; DISABLE-VZ-NEXT:    .p2align 4, 0x90
+; DISABLE-VZ-NEXT:  .LBB3_1: # %while.cond
+; DISABLE-VZ-NEXT:    # =>This Inner Loop Header: Depth=1
+; DISABLE-VZ-NEXT:    callq foo
+; DISABLE-VZ-NEXT:    testl %eax, %eax
+; DISABLE-VZ-NEXT:    jne .LBB3_1
+; DISABLE-VZ-NEXT:  # %bb.2: # %for.body.preheader
+; DISABLE-VZ-NEXT:    movl $4, %ebx
+; DISABLE-VZ-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; DISABLE-VZ-NEXT:    .p2align 4, 0x90
+; DISABLE-VZ-NEXT:  .LBB3_3: # %for.body
+; DISABLE-VZ-NEXT:    # =>This Inner Loop Header: Depth=1
+; DISABLE-VZ-NEXT:    callq do_sse
+; DISABLE-VZ-NEXT:    callq do_sse
+; DISABLE-VZ-NEXT:    vmovaps g+{{.*}}(%rip), %xmm0
+; DISABLE-VZ-NEXT:    callq do_sse
+; DISABLE-VZ-NEXT:    decl %ebx
+; DISABLE-VZ-NEXT:    jne .LBB3_3
+; DISABLE-VZ-NEXT:  # %bb.4: # %for.end
+; DISABLE-VZ-NEXT:    addq $16, %rsp
+; DISABLE-VZ-NEXT:    popq %rbx
+; DISABLE-VZ-NEXT:    retq
 ;
 ; BDVER2-LABEL: test03:
 ; BDVER2:       # %bb.0: # %entry
@@ -279,15 +280,15 @@
 ; VZ-NEXT:    vzeroupper
 ; VZ-NEXT:    retq
 ;
-; FAST-ymm-zmm-LABEL: test04:
-; FAST-ymm-zmm:       # %bb.0:
-; FAST-ymm-zmm-NEXT:    pushq %rax
-; FAST-ymm-zmm-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; FAST-ymm-zmm-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; FAST-ymm-zmm-NEXT:    callq do_avx
-; FAST-ymm-zmm-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; FAST-ymm-zmm-NEXT:    popq %rax
-; FAST-ymm-zmm-NEXT:    retq
+; DISABLE-VZ-LABEL: test04:
+; DISABLE-VZ:       # %bb.0:
+; DISABLE-VZ-NEXT:    pushq %rax
+; DISABLE-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; DISABLE-VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; DISABLE-VZ-NEXT:    callq do_avx
+; DISABLE-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; DISABLE-VZ-NEXT:    popq %rax
+; DISABLE-VZ-NEXT:    retq
 ;
 ; BDVER2-LABEL: test04:
 ; BDVER2:       # %bb.0: