Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -238,11 +238,12 @@ def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", "Use software floating point features.">; -// On at least some AMD processors, there is no performance hazard to writing -// only the lower parts of a YMM register without clearing the upper part. -def FeatureFastPartialYMMWrite - : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite", - "true", "Partial writes to YMM registers are fast">; +// On some X86 processors, there is no performance hazard to writing only the +// lower parts of a YMM or ZMM register without clearing the upper part. +def FeatureFastPartialYMMorZMMWrite + : SubtargetFeature<"fast-partial-ymm-or-zmm-write", + "HasFastPartialYMMorZMMWrite", + "true", "Partial writes to YMM/ZMM registers are fast">; // FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency // than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if // vector FSQRT has higher throughput than the corresponding NR code. @@ -535,7 +536,8 @@ FeatureLZCNT, FeatureBMI, FeatureBMI2, - FeatureFMA + FeatureFMA, + FeatureFastPartialYMMorZMMWrite ]>; def : KnightsLandingProc<"knl">; @@ -649,7 +651,7 @@ FeatureXSAVEOPT, FeatureSlowSHLD, FeatureLAHFSAHF, - FeatureFastPartialYMMWrite + FeatureFastPartialYMMorZMMWrite ]>; // Bulldozer Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -207,8 +207,8 @@ bool UseLeaForSP; /// True if there is no performance penalty to writing only the lower parts - /// of a YMM register without clearing the upper part. - bool HasFastPartialYMMWrite; + /// of a YMM or ZMM register without clearing the upper part. + bool HasFastPartialYMMorZMMWrite; /// True if hardware SQRTSS instruction is at least as fast (latency) as /// RSQRTSS followed by a Newton-Raphson iteration. @@ -462,7 +462,9 @@ bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } - bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; } + bool hasFastPartialYMMorZMMWrite() const { + return HasFastPartialYMMorZMMWrite; + } bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasFastLZCNT() const { return HasFastLZCNT; } Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -298,7 +298,7 @@ HasSSEUnalignedMem = false; HasCmpxchg16b = false; UseLeaForSP = false; - HasFastPartialYMMWrite = false; + HasFastPartialYMMorZMMWrite = false; HasFastScalarFSQRT = false; HasFastVectorFSQRT = false; HasFastLZCNT = false; Index: lib/Target/X86/X86VZeroUpper.cpp =================================================================== --- lib/Target/X86/X86VZeroUpper.cpp +++ lib/Target/X86/X86VZeroUpper.cpp @@ -56,11 +56,11 @@ // Core algorithm state: // BlockState - Each block is either: - // - PASS_THROUGH: There are neither YMM dirtying instructions nor + // - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor // vzeroupper instructions in this block. 
// - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this - // block that will ensure that YMM is clean on exit. - // - EXITS_DIRTY: An instruction in the block dirties YMM and no + // block that will ensure that YMM/ZMM is clean on exit. + // - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no // subsequent vzeroupper in the block clears it. // // AddedToDirtySuccessors - This flag is raised when a block is added to the @@ -106,51 +106,54 @@ llvm_unreachable("Invalid block exit state."); } -static bool isYmmReg(unsigned Reg) { - return (Reg >= X86::YMM0 && Reg <= X86::YMM15); +/// VZEROUPPER cleans state that is related to Y/ZMM0-15 only. +/// Thus, there is no need to check for Y/ZMM16 and above. +static bool isYmmOrZmmReg(unsigned Reg) { + return (Reg >= X86::YMM0 && Reg <= X86::YMM15) || + (Reg >= X86::ZMM0 && Reg <= X86::ZMM15); } -static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { +static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) { for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) - if (isYmmReg(I->first)) + if (isYmmOrZmmReg(I->first)) return true; return false; } -static bool clobbersAllYmmRegs(const MachineOperand &MO) { +static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) { for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { if (!MO.clobbersPhysReg(reg)) return false; } + for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } return true; } -static bool hasYmmReg(MachineInstr &MI) { +static bool hasYmmOrZmmReg(MachineInstr &MI) { for (const MachineOperand &MO : MI.operands()) { - if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO)) + if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO)) return true; if (!MO.isReg()) continue; if (MO.isDebug()) continue; - if (isYmmReg(MO.getReg())) + if (isYmmOrZmmReg(MO.getReg())) return true; } return false; } -/// Check if any YMM register will be clobbered by this instruction. -static bool callClobbersAnyYmmReg(MachineInstr &MI) { +/// Check if given call instruction has a RegMask operand. +static bool callHasRegMask(MachineInstr &MI) { assert(MI.isCall() && "Can only be called on call instructions."); for (const MachineOperand &MO : MI.operands()) { - if (!MO.isRegMask()) - continue; - for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { - if (MO.clobbersPhysReg(reg)) - return true; - } + if (MO.isRegMask()) + return true; } return false; } @@ -175,17 +178,20 @@ /// Loop over all of the instructions in the basic block, inserting vzeroupper /// instructions before function calls. void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { - // Start by assuming that the block is PASS_THROUGH which implies no unguarded // calls. BlockExitState CurState = PASS_THROUGH; BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end(); for (MachineInstr &MI : MBB) { + bool IsCall = MI.isCall(); + bool IsReturn = MI.isReturn(); + bool IsControlFlow = IsCall || IsReturn; + // No need for vzeroupper before iret in interrupt handler function, - // epilogue will restore YMM registers if needed. - bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn(); - bool IsControlFlow = MI.isCall() || MI.isReturn(); + // epilogue will restore YMM/ZMM registers if needed. + if (IsX86INTR && IsReturn) + continue; // An existing VZERO* instruction resets the state. 
if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) { @@ -194,30 +200,30 @@ } // Shortcut: don't need to check regular instructions in dirty state. - if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY) + if (!IsControlFlow && CurState == EXITS_DIRTY) continue; - if (hasYmmReg(MI)) { - // We found a ymm-using instruction; this could be an AVX instruction, - // or it could be control flow. + if (hasYmmOrZmmReg(MI)) { + // We found a ymm/zmm-using instruction; this could be an AVX/AVX512 + // instruction, or it could be control flow. CurState = EXITS_DIRTY; continue; } // Check for control-flow out of the current function (which might // indirectly execute SSE instructions). - if (!IsControlFlow || IsReturnFromX86INTR) + if (!IsControlFlow) continue; - // If the call won't clobber any YMM register, skip it as well. It usually - // happens on helper function calls (such as '_chkstk', '_ftol2') where - // standard calling convention is not used (RegMask is not used to mark - // register clobbered and register usage (def/imp-def/use) is well-defined - // and explicitly specified. - if (MI.isCall() && !callClobbersAnyYmmReg(MI)) + // If the call has no RegMask, skip it as well. It usually happens on + // helper function calls (such as '_chkstk', '_ftol2') where the standard + // calling convention is not used (no RegMask is used to mark clobbered + // registers; register usage (def/imp-def/use) is well-defined and + // explicitly specified). + if (IsCall && !callHasRegMask(MI)) continue; - // The VZEROUPPER instruction resets the upper 128 bits of all AVX + // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15 // registers. In addition, the processor changes back to Clean state, after // which execution of SSE instructions or AVX instructions has no transition // penalty. Add the VZEROUPPER instruction before any function call/return // that might execute SSE code. // FIXME: In some cases, we may want to move the VZEROUPPER into a // predecessor block. if (CurState == EXITS_DIRTY) { // After the inserted VZEROUPPER the state becomes clean again, but - // other YMM may appear before other subsequent calls or even before + // other YMM/ZMM uses may appear before subsequent calls or even before // the end of the BB. insertVZeroUpper(MI, MBB); CurState = EXITS_CLEAN; @@ -257,30 +263,32 @@ /// function calls. bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite()) + if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite()) return false; TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); EverMadeChange = false; IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR; - bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); + bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI); - // Fast check: if the function doesn't use any ymm registers, we don't need - // to insert any VZEROUPPER instructions. This is constant-time, so it is - // cheap in the common case of no ymm use. - bool YMMUsed = FnHasLiveInYmm; - if (!YMMUsed) { - const TargetRegisterClass *RC = &X86::VR256RegClass; - for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; - i++) { - if (!MRI.reg_nodbg_empty(*i)) { - YMMUsed = true; - break; + // Fast check: if the function doesn't use any ymm/zmm registers, we don't + // need to insert any VZEROUPPER instructions. This is constant-time, so it + // is cheap in the common case of no ymm/zmm use. 
+ bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm; + const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass}; + for (auto *RC : RCs) { + if (!YmmOrZmmUsed) { + for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; + i++) { + if (!MRI.reg_nodbg_empty(*i)) { + YmmOrZmmUsed = true; + break; + } } } } - if (!YMMUsed) { + if (!YmmOrZmmUsed) { return false; } @@ -294,9 +302,9 @@ for (MachineBasicBlock &MBB : MF) processBasicBlock(MBB); - // If any YMM regs are live-in to this function, add the entry block to the - // DirtySuccessors list - if (FnHasLiveInYmm) + // If any YMM/ZMM regs are live-in to this function, add the entry block to + // the DirtySuccessors list + if (FnHasLiveInYmmOrZmm) addDirtySuccessor(MF.front()); // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add Index: test/CodeGen/X86/avg.ll =================================================================== --- test/CodeGen/X86/avg.ll +++ test/CodeGen/X86/avg.ll @@ -234,6 +234,7 @@ ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i8: @@ -241,6 +242,7 @@ ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512BW-NEXT: vpavgb (%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %a %2 = load <32 x i8>, <32 x i8>* %b @@ -563,6 +565,7 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8: @@ -570,6 +573,7 @@ ; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 ; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a %2 = load <64 x i8>, <64 x i8>* %b @@ -727,6 +731,7 @@ ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v16i16: @@ -734,6 +739,7 @@ ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512BW-NEXT: vpavgw (%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a %2 = load <16 x i16>, <16 x i16>* %b @@ -889,6 +895,7 @@ ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, (%rax) ; AVX512F-NEXT: vpmovdw %zmm1, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i16: @@ -896,6 +903,7 @@ ; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 ; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %a %2 = load <32 x i16>, <32 x i16>* %b @@ -1139,6 +1147,7 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i8_2: @@ -1146,6 +1155,7 @@ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %a %2 = load <32 x i8>, <32 x i8>* %b @@ -1393,6 +1403,7 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; 
AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8_2: @@ -1400,6 +1411,7 @@ ; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 ; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a %2 = load <64 x i8>, <64 x i8>* %b @@ -1558,6 +1570,7 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v16i16_2: @@ -1565,6 +1578,7 @@ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a %2 = load <16 x i16>, <16 x i16>* %b @@ -1720,6 +1734,7 @@ ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, (%rax) ; AVX512F-NEXT: vpmovdw %zmm1, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i16_2: @@ -1727,6 +1742,7 @@ ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 ; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %a %2 = load <32 x i16>, <32 x i16>* %b @@ -1925,6 +1941,7 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i8_const: @@ -1932,6 +1949,7 @@ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %a %2 = zext <32 x i8> %1 to <32 x i32> @@ -2149,6 +2167,7 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vmovdqu %ymm2, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8_const: @@ -2156,6 +2175,7 @@ ; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 ; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a %2 = zext <64 x i8> %1 to <64 x i32> @@ -2289,6 +2309,7 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v16i16_const: @@ -2296,6 +2317,7 @@ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a %2 = zext <16 x i16> %1 to <16 x i32> @@ -2412,6 +2434,7 @@ ; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1 ; AVX512F-NEXT: vpmovdw %zmm1, (%rax) ; AVX512F-NEXT: vpmovdw %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i16_const: @@ -2419,6 +2442,7 @@ ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 ; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %a %2 = zext <32 x i16> %1 to <32 x i32> Index: test/CodeGen/X86/avx-intel-ocl.ll =================================================================== --- 
test/CodeGen/X86/avx-intel-ocl.ll +++ test/CodeGen/X86/avx-intel-ocl.ll @@ -3,7 +3,20 @@ ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx | FileCheck -check-prefix=WIN64 %s ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck -check-prefix=X64 %s -declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *) +declare void @foo_sse(); +define intel_ocl_bicc void @foo_preseved_ymm() { +call void @foo_sse() +ret void +} + +define <16 x float> @foo_dirty_state(<16 x float> %a) nounwind { + %res = fadd <16 x float> %a, %a + call intel_ocl_bicc void @foo_preseved_ymm() + call intel_ocl_bicc void @foo_preseved_ymm() + ret <16 x float> %res +} + +declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *) declare <16 x float> @func_float16(<16 x float>, <16 x float>) declare i32 @func_int(i32, i32) Index: test/CodeGen/X86/avx-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/avx-intrinsics-x86.ll +++ test/CodeGen/X86/avx-intrinsics-x86.ll @@ -2773,6 +2773,7 @@ ; AVX512VL-LABEL: test_x86_avx_cvt_pd2_ps_256: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: vcvtpd2ps %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; AVX512VL-NEXT: retl ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) ; <<4 x float>> [#uses=1] ret <4 x float> %res @@ -2790,6 +2791,7 @@ ; AVX512VL-LABEL: test_x86_avx_cvt_pd2dq_256: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: vcvtpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; AVX512VL-NEXT: retl ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res @@ -2834,6 +2836,7 @@ ; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; AVX512VL-NEXT: retl ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res @@ -2985,18 +2988,12 @@ define void @test_x86_avx_maskstore_pd_256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) { -; AVX-LABEL: test_x86_avx_maskstore_pd_256: -; AVX: ## BB#0: -; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_maskstore_pd_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_maskstore_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; CHECK-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) ret void } @@ -3016,18 +3013,12 @@ define void @test_x86_avx_maskstore_ps_256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) { -; AVX-LABEL: test_x86_avx_maskstore_ps_256: -; AVX: ## 
BB#0: -; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_maskstore_ps_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_maskstore_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) ret void } @@ -3099,16 +3090,11 @@ define i32 @test_x86_avx_movmsk_pd_256(<4 x double> %a0) { -; AVX-LABEL: test_x86_avx_movmsk_pd_256: -; AVX: ## BB#0: -; AVX-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_movmsk_pd_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_movmsk_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ; [#uses=1] ret i32 %res } @@ -3116,16 +3102,11 @@ define i32 @test_x86_avx_movmsk_ps_256(<8 x float> %a0) { -; AVX-LABEL: test_x86_avx_movmsk_ps_256: -; AVX: ## BB#0: -; AVX-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_movmsk_ps_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_movmsk_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ; [#uses=1] ret i32 %res } @@ -3138,20 +3119,13 @@ define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) { -; AVX-LABEL: test_x86_avx_ptestc_256: -; AVX: ## BB#0: -; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] -; AVX-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0] -; AVX-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_ptestc_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] -; AVX512VL-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0] -; AVX512VL-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_ptestc_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] +; CHECK-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0] +; CHECK-NEXT: andl $1, %eax ## 
encoding: [0x83,0xe0,0x01] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) ; [#uses=1] ret i32 %res } @@ -3159,20 +3133,13 @@ define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) { -; AVX-LABEL: test_x86_avx_ptestnzc_256: -; AVX: ## BB#0: -; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] -; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_ptestnzc_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] -; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_ptestnzc_256: +; CHECK: ## BB#0: +; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] +; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1) ; [#uses=1] ret i32 %res } @@ -3180,20 +3147,13 @@ define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) { -; AVX-LABEL: test_x86_avx_ptestz_256: -; AVX: ## BB#0: -; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] -; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_ptestz_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] -; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_ptestz_256: +; CHECK: ## BB#0: +; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1] +; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1) ; [#uses=1] ret i32 %res } @@ -3421,20 +3381,13 @@ define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) { -; AVX-LABEL: test_x86_avx_vtestc_pd_256: -; AVX: ## BB#0: -; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] -; AVX-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0] -; AVX-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_vtestc_pd_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] -; AVX512VL-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0] -; AVX512VL-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_vtestc_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] +; CHECK-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0] +; CHECK-NEXT: andl $1, %eax ## encoding: 
[0x83,0xe0,0x01] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) ; [#uses=1] ret i32 %res } @@ -3455,20 +3408,13 @@ define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) { -; AVX-LABEL: test_x86_avx_vtestc_ps_256: -; AVX: ## BB#0: -; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] -; AVX-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0] -; AVX-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_vtestc_ps_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] -; AVX512VL-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0] -; AVX512VL-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_vtestc_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] +; CHECK-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0] +; CHECK-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) ; [#uses=1] ret i32 %res } @@ -3489,20 +3435,13 @@ define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) { -; AVX-LABEL: test_x86_avx_vtestnzc_pd_256: -; AVX: ## BB#0: -; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] -; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_vtestnzc_pd_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] -; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_vtestnzc_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] +; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1) ; [#uses=1] ret i32 %res } @@ -3523,20 +3462,13 @@ define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) { -; AVX-LABEL: test_x86_avx_vtestnzc_ps_256: -; AVX: ## BB#0: -; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] -; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_vtestnzc_ps_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] -; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_vtestnzc_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: 
[0xc4,0xe2,0x7d,0x0e,0xc1] +; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1) ; [#uses=1] ret i32 %res } @@ -3557,20 +3489,13 @@ define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) { -; AVX-LABEL: test_x86_avx_vtestz_pd_256: -; AVX: ## BB#0: -; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] -; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_vtestz_pd_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] -; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_vtestz_pd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1] +; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1) ; [#uses=1] ret i32 %res } @@ -3591,20 +3516,13 @@ define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) { -; AVX-LABEL: test_x86_avx_vtestz_ps_256: -; AVX: ## BB#0: -; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] -; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_vtestz_ps_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] -; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] -; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_vtestz_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] +; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1] +; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1) ; [#uses=1] ret i32 %res } @@ -3750,6 +3668,7 @@ ; AVX512VL-NEXT: vpaddq LCPI247_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A] ; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI247_0, kind: FK_Data_4 ; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; AVX512VL-NEXT: retl ## encoding: [0xc3] %a2 = add <2 x i64> %a1, %a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> @@ -3770,6 +3689,7 @@ ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; AVX512VL-NEXT: vmovntps %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; AVX512VL-NEXT: retl ## encoding: [0xc3] tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> 
%a) nounwind ret void @@ -3793,6 +3713,7 @@ ; AVX512VL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x57,0xc9] ; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; AVX512VL-NEXT: vmovntpd %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x00] +; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; AVX512VL-NEXT: retl ## encoding: [0xc3] %a2 = fadd <4 x double> %a1, tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind Index: test/CodeGen/X86/avx-vzeroupper.ll =================================================================== --- test/CodeGen/X86/avx-vzeroupper.ll +++ test/CodeGen/X86/avx-vzeroupper.ll @@ -1,8 +1,9 @@ ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-write | FileCheck --check-prefix=FASTYMM %s +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck --check-prefix=FAST-YMM-ZMM %s ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s -; FASTYMM-NOT: vzeroupper +; FAST-YMM-ZMM-NOT: vzeroupper ; BTVER2-NOT: vzeroupper declare i32 @foo() Index: test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-x86.ll +++ test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -227,16 +227,11 @@ define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) { -; AVX2-LABEL: test_x86_avx2_pmovmskb: -; AVX2: ## BB#0: -; AVX2-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0] -; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx2_pmovmskb: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_pmovmskb: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; [#uses=1] ret i32 %res } @@ -1179,18 +1174,12 @@ define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) { -; AVX2-LABEL: test_x86_avx2_maskstore_q_256: -; AVX2: ## BB#0: -; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX2-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08] -; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx2_maskstore_q_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_maskstore_q_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; CHECK-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x 
i64> %a1, <4 x i64> %a2) ret void } @@ -1210,18 +1199,12 @@ define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) { -; AVX2-LABEL: test_x86_avx2_maskstore_d_256: -; AVX2: ## BB#0: -; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08] -; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx2_maskstore_d_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_maskstore_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; CHECK-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) ret void } @@ -1522,18 +1505,12 @@ <2 x i64>, <4 x float>, i8) nounwind readonly define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask) { -; AVX2-LABEL: test_x86_avx2_gather_q_ps_256: -; AVX2: ## BB#0: -; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX2-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48] -; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx2_gather_q_ps_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_gather_q_ps_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; CHECK-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ; ret <4 x float> %res @@ -1633,18 +1610,12 @@ <2 x i64>, <4 x i32>, i8) nounwind readonly define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask) { -; AVX2-LABEL: test_x86_avx2_gather_q_d_256: -; AVX2: ## BB#0: -; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX2-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48] -; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; AVX2-NEXT: retl ## encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx2_gather_q_d_256: -; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48] -; AVX512VL-NEXT: retl ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_gather_q_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; CHECK-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48] +; CHECK-NEXT: vzeroupper 
## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: retl ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ; ret <4 x i32> %res Index: test/CodeGen/X86/avx2-vbroadcast.ll =================================================================== --- test/CodeGen/X86/avx2-vbroadcast.ll +++ test/CodeGen/X86/avx2-vbroadcast.ll @@ -1200,6 +1200,7 @@ ; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-AVX512VL-NEXT: movl %ebp, %esp ; X32-AVX512VL-NEXT: popl %ebp +; X32-AVX512VL-NEXT: vzeroupper ; X32-AVX512VL-NEXT: retl ; ; X64-AVX512VL-LABEL: isel_crash_32b: @@ -1223,6 +1224,7 @@ ; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-AVX512VL-NEXT: movq %rbp, %rsp ; X64-AVX512VL-NEXT: popq %rbp +; X64-AVX512VL-NEXT: vzeroupper ; X64-AVX512VL-NEXT: retq eintry: %__a.addr.i = alloca <4 x i64>, align 16 @@ -1347,6 +1349,7 @@ ; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-AVX512VL-NEXT: movl %ebp, %esp ; X32-AVX512VL-NEXT: popl %ebp +; X32-AVX512VL-NEXT: vzeroupper ; X32-AVX512VL-NEXT: retl ; ; X64-AVX512VL-LABEL: isel_crash_16w: @@ -1370,6 +1373,7 @@ ; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-AVX512VL-NEXT: movq %rbp, %rsp ; X64-AVX512VL-NEXT: popq %rbp +; X64-AVX512VL-NEXT: vzeroupper ; X64-AVX512VL-NEXT: retq eintry: %__a.addr.i = alloca <4 x i64>, align 16 @@ -1437,28 +1441,28 @@ } define void @isel_crash_8d(i32* %cV_R.addr) { -; X32-AVX2-LABEL: isel_crash_8d: -; X32-AVX2: ## BB#0: ## %eintry -; X32-AVX2-NEXT: pushl %ebp -; X32-AVX2-NEXT: Lcfi9: -; X32-AVX2-NEXT: .cfi_def_cfa_offset 8 -; X32-AVX2-NEXT: Lcfi10: -; X32-AVX2-NEXT: .cfi_offset %ebp, -8 -; X32-AVX2-NEXT: movl %esp, %ebp -; X32-AVX2-NEXT: Lcfi11: -; X32-AVX2-NEXT: .cfi_def_cfa_register %ebp -; X32-AVX2-NEXT: andl $-32, %esp -; X32-AVX2-NEXT: subl $128, %esp -; X32-AVX2-NEXT: movl 8(%ebp), %eax -; X32-AVX2-NEXT: vxorps %ymm0, %ymm0, %ymm0 -; X32-AVX2-NEXT: vmovaps %ymm0, (%esp) -; X32-AVX2-NEXT: vbroadcastss (%eax), %ymm1 -; X32-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X32-AVX2-NEXT: movl %ebp, %esp -; X32-AVX2-NEXT: popl %ebp -; X32-AVX2-NEXT: vzeroupper -; X32-AVX2-NEXT: retl +; X32-LABEL: isel_crash_8d: +; X32: ## BB#0: ## %eintry +; X32-NEXT: pushl %ebp +; X32-NEXT: Lcfi9: +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: Lcfi10: +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: Lcfi11: +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-32, %esp +; X32-NEXT: subl $128, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-NEXT: vmovaps %ymm0, (%esp) +; X32-NEXT: vbroadcastss (%eax), %ymm1 +; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) +; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: vzeroupper +; X32-NEXT: retl ; ; X64-AVX2-LABEL: isel_crash_8d: ; X64-AVX2: ## BB#0: ## %eintry @@ -1484,28 +1488,6 @@ ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; -; X32-AVX512VL-LABEL: isel_crash_8d: -; X32-AVX512VL: ## BB#0: ## %eintry -; X32-AVX512VL-NEXT: pushl %ebp -; X32-AVX512VL-NEXT: Lcfi9: -; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 8 -; X32-AVX512VL-NEXT: Lcfi10: -; X32-AVX512VL-NEXT: .cfi_offset %ebp, -8 -; X32-AVX512VL-NEXT: movl %esp, %ebp -; X32-AVX512VL-NEXT: Lcfi11: -; X32-AVX512VL-NEXT: .cfi_def_cfa_register %ebp -; X32-AVX512VL-NEXT: andl $-32, %esp -; X32-AVX512VL-NEXT: subl $128, %esp -; X32-AVX512VL-NEXT: movl 8(%ebp), %eax -; 
X32-AVX512VL-NEXT: vxorps %ymm0, %ymm0, %ymm0 -; X32-AVX512VL-NEXT: vmovaps %ymm0, (%esp) -; X32-AVX512VL-NEXT: vbroadcastss (%eax), %ymm1 -; X32-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp) -; X32-AVX512VL-NEXT: movl %ebp, %esp -; X32-AVX512VL-NEXT: popl %ebp -; X32-AVX512VL-NEXT: retl -; ; X64-AVX512VL-LABEL: isel_crash_8d: ; X64-AVX512VL: ## BB#0: ## %eintry ; X64-AVX512VL-NEXT: pushq %rbp @@ -1526,6 +1508,7 @@ ; X64-AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) ; X64-AVX512VL-NEXT: movq %rbp, %rsp ; X64-AVX512VL-NEXT: popq %rbp +; X64-AVX512VL-NEXT: vzeroupper ; X64-AVX512VL-NEXT: retq eintry: %__a.addr.i = alloca <4 x i64>, align 16 @@ -1676,6 +1659,7 @@ ; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp) ; X32-AVX512VL-NEXT: movl %ebp, %esp ; X32-AVX512VL-NEXT: popl %ebp +; X32-AVX512VL-NEXT: vzeroupper ; X32-AVX512VL-NEXT: retl ; ; X64-AVX512VL-LABEL: isel_crash_4q: @@ -1698,6 +1682,7 @@ ; X64-AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) ; X64-AVX512VL-NEXT: movq %rbp, %rsp ; X64-AVX512VL-NEXT: popq %rbp +; X64-AVX512VL-NEXT: vzeroupper ; X64-AVX512VL-NEXT: retq eintry: %__a.addr.i = alloca <4 x i64>, align 16 Index: test/CodeGen/X86/avx512-any_extend_load.ll =================================================================== --- test/CodeGen/X86/avx512-any_extend_load.ll +++ test/CodeGen/X86/avx512-any_extend_load.ll @@ -4,12 +4,20 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) { -; ALL-LABEL: any_extend_load_v8i64: -; ALL: # BB#0: -; ALL-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; ALL-NEXT: vpmovqb %zmm0, (%rdi) -; ALL-NEXT: retq +; KNL-LABEL: any_extend_load_v8i64: +; KNL: # BB#0: +; KNL-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero +; KNL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; KNL-NEXT: vpmovqb %zmm0, (%rdi) +; KNL-NEXT: retq +; +; SKX-LABEL: any_extend_load_v8i64: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero +; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; SKX-NEXT: vpmovqb %zmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1 %1 = zext <8 x i8> %wide.load to <8 x i64> %2 = add nuw nsw <8 x i64> %1, @@ -33,6 +41,7 @@ ; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; SKX-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; SKX-NEXT: vpmovdb %ymm0, (%rdi) +; 
SKX-NEXT: vzeroupper ; SKX-NEXT: retq %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1 %1 = zext <8 x i8> %wide.load to <8 x i32> Index: test/CodeGen/X86/avx512-arith.ll =================================================================== --- test/CodeGen/X86/avx512-arith.ll +++ test/CodeGen/X86/avx512-arith.ll @@ -233,6 +233,7 @@ ; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; SKX-LABEL: imulq128: Index: test/CodeGen/X86/avx512-calling-conv.ll =================================================================== --- test/CodeGen/X86/avx512-calling-conv.ll +++ test/CodeGen/X86/avx512-calling-conv.ll @@ -140,6 +140,7 @@ ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _func8xi1 ; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SKX-NEXT: vpslld $31, %ymm0, %ymm0 @@ -192,6 +193,7 @@ ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _func16xi1 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; SKX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -291,6 +293,7 @@ ; SKX-NEXT: .cfi_def_cfa_offset 16 ; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _func8xi1 ; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 ; SKX-NEXT: vpmovw2m %xmm0, %k0 Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW 
--check-prefix=AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW define <16 x float> @sitof32(<16 x i32> %a) nounwind { @@ -110,40 +110,78 @@ ; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %b = sitofp <2 x i64> %a to <2 x float> ret <2 x float>%b } define <4 x float> @sltof4f32_mem(<4 x i64>* %a) { -; NODQ-LABEL: sltof4f32_mem: -; NODQ: ## BB#0: -; NODQ-NEXT: vmovdqu (%rdi), %ymm0 -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; NODQ-NEXT: retq +; KNL-LABEL: sltof4f32_mem: +; KNL: ## BB#0: +; KNL-NEXT: vmovdqu (%rdi), %ymm0 +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; KNL-NEXT: retq ; ; VLDQ-LABEL: sltof4f32_mem: ; VLDQ: ## BB#0: ; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 ; VLDQ-NEXT: retq ; +; VLNODQ-LABEL: sltof4f32_mem: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vmovdqu (%rdi), %ymm0 +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VLNODQ-NEXT: vinsertps 
{{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VLNODQ-NEXT: vzeroupper +; VLNODQ-NEXT: retq +; ; AVX512DQ-LABEL: sltof4f32_mem: ; AVX512DQ: ## BB#0: ; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: sltof4f32_mem: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %a1 = load <4 x i64>, <4 x i64>* %a, align 8 %b = sitofp <4 x i64> %a1 to <4 x float> ret <4 x float>%b @@ -218,65 +256,137 @@ } define <4 x float> @sltof432(<4 x i64> %a) { -; NODQ-LABEL: sltof432: -; NODQ: ## BB#0: -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; NODQ-NEXT: retq +; KNL-LABEL: sltof432: +; KNL: ## BB#0: +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; KNL-NEXT: retq ; ; VLDQ-LABEL: sltof432: ; VLDQ: ## BB#0: ; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 +; VLDQ-NEXT: vzeroupper ; VLDQ-NEXT: retq ; +; VLNODQ-LABEL: sltof432: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VLNODQ-NEXT: vzeroupper +; VLNODQ-NEXT: retq +; ; AVX512DQ-LABEL: sltof432: ; AVX512DQ: ## BB#0: ; AVX512DQ-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: 
## kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: sltof432: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %b = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } define <4 x float> @ultof432(<4 x i64> %a) { -; NODQ-LABEL: ultof432: -; NODQ: ## BB#0: -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; NODQ-NEXT: retq +; KNL-LABEL: ultof432: +; KNL: ## BB#0: +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vmovq %xmm0, %rax +; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; KNL-NEXT: vpextrq $1, %xmm0, %rax +; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 +; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; KNL-NEXT: retq ; ; VLDQ-LABEL: ultof432: ; VLDQ: ## BB#0: ; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; VLDQ-NEXT: vzeroupper ; VLDQ-NEXT: retq ; +; VLNODQ-LABEL: ultof432: +; VLNODQ: ## BB#0: +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; VLNODQ-NEXT: vmovq %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax +; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 +; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VLNODQ-NEXT: vzeroupper +; VLNODQ-NEXT: retq +; ; AVX512DQ-LABEL: ultof432: ; AVX512DQ: ## BB#0: ; AVX512DQ-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: ultof432: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: 
vcvtusi2ssq %rax, %xmm3, %xmm2 +; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax +; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 +; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %b = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } @@ -355,17 +465,33 @@ } define <4 x i32> @fptoui_128(<4 x float> %a) nounwind { -; NOVL-LABEL: fptoui_128: -; NOVL: ## BB#0: -; NOVL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 -; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 -; NOVL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 -; NOVL-NEXT: retq +; KNL-LABEL: fptoui_128: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: vcvttps2udq %zmm0, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq ; ; VL-LABEL: fptoui_128: ; VL: ## BB#0: ; VL-NEXT: vcvttps2udq %xmm0, %xmm0 ; VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_128: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: fptoui_128: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vcvttps2udq %zmm0, %zmm0 +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %b = fptoui <4 x float> %a to <4 x i32> ret <4 x i32> %b } @@ -380,17 +506,34 @@ } define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind { -; NOVL-LABEL: fptoui_256d: -; NOVL: ## BB#0: -; NOVL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 -; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 -; NOVL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 -; NOVL-NEXT: retq +; KNL-LABEL: fptoui_256d: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: retq ; ; VL-LABEL: fptoui_256d: ; VL: ## BB#0: ; VL-NEXT: vcvttpd2udq %ymm0, %xmm0 +; VL-NEXT: vzeroupper ; VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_256d: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: fptoui_256d: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512BW-NEXT: vcvttpd2udq %zmm0, %ymm0 +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %b = fptoui <4 x double> %a to <4 x i32> ret <4 x i32> %b } @@ -448,10 +591,16 @@ } define <4 x i32> @fptosi03(<4 x double> %a) { -; ALL-LABEL: fptosi03: -; ALL: ## BB#0: -; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0 -; ALL-NEXT: retq +; KNL-LABEL: fptosi03: +; KNL: ## BB#0: +; KNL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; KNL-NEXT: retq +; +; AVX512-LABEL: fptosi03: +; AVX512: ## BB#0: +; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %b = fptosi <4 x double> %a to <4 x i32> ret <4 x i32> %b } @@ -475,29 +624,54 @@ } define <4 x float> @fptrunc01(<4 x double> %b) { -; ALL-LABEL: fptrunc01: -; ALL: ## BB#0: -; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; ALL-NEXT: retq +; KNL-LABEL: fptrunc01: +; KNL: ## BB#0: +; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; KNL-NEXT: retq +; +; AVX512-LABEL: fptrunc01: +; AVX512: ## BB#0: +; AVX512-NEXT: vcvtpd2ps %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a = fptrunc <4 x double> %b to <4 x float> ret <4 x float> %a } define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) { -; 
NOVL-LABEL: fptrunc02: -; NOVL: ## BB#0: -; NOVL-NEXT: vpslld $31, %xmm1, %xmm1 -; NOVL-NEXT: vpsrad $31, %xmm1, %xmm1 -; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; NOVL-NEXT: vpand %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: retq +; KNL-LABEL: fptrunc02: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL-NEXT: vpsrad $31, %xmm1, %xmm1 +; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0 +; KNL-NEXT: vpand %xmm0, %xmm1, %xmm0 +; KNL-NEXT: retq ; ; VL-LABEL: fptrunc02: ; VL: ## BB#0: ; VL-NEXT: vpslld $31, %xmm1, %xmm1 ; VL-NEXT: vptestmd %xmm1, %xmm1, %k1 ; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} +; VL-NEXT: vzeroupper ; VL-NEXT: retq +; +; AVX512DQ-LABEL: fptrunc02: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX512DQ-NEXT: vcvtpd2ps %ymm0, %xmm0 +; AVX512DQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: fptrunc02: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX512BW-NEXT: vcvtpd2ps %ymm0, %xmm0 +; AVX512BW-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %a = fptrunc <4 x double> %b to <4 x float> %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer ret <4 x float> %c @@ -761,17 +935,33 @@ } define <4 x float> @uitof32_128(<4 x i32> %a) nounwind { -; NOVL-LABEL: uitof32_128: -; NOVL: ## BB#0: -; NOVL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 -; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 -; NOVL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 -; NOVL-NEXT: retq +; KNL-LABEL: uitof32_128: +; KNL: ## BB#0: +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0 +; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; KNL-NEXT: retq ; ; VL-LABEL: uitof32_128: ; VL: ## BB#0: ; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; VL-NEXT: retq +; +; AVX512DQ-LABEL: uitof32_128: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512DQ-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: uitof32_128: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %b = uitofp <4 x i32> %a to <4 x float> ret <4 x float> %b } Index: test/CodeGen/X86/avx512-ext.ll =================================================================== --- test/CodeGen/X86/avx512-ext.ll +++ test/CodeGen/X86/avx512-ext.ll @@ -1342,12 +1342,20 @@ } define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { -; ALL-LABEL: trunc_16i32_to_16i1: -; ALL: ## BB#0: -; ALL-NEXT: vpslld $31, %zmm0, %zmm0 -; ALL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; ALL-NEXT: kmovw %k0, %eax -; ALL-NEXT: retq +; KNL-LABEL: trunc_16i32_to_16i1: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: trunc_16i32_to_16i1: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $31, %zmm0, %zmm0 +; SKX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %mask_b = trunc <16 x i32>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 ret i16 %mask @@ -1447,6 +1455,7 @@ ; SKX: ## BB#0: ; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = icmp slt <8 x i32> %a1, %a2 %y = sext <8 x i1> %x to <8 x i16> @@ -1488,11 +1497,18 @@ } define void 
@extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { -; ALL-LABEL: extload_v8i64: -; ALL: ## BB#0: -; ALL-NEXT: vpmovsxbq (%rdi), %zmm0 -; ALL-NEXT: vmovdqa64 %zmm0, (%rsi) -; ALL-NEXT: retq +; KNL-LABEL: extload_v8i64: +; KNL: ## BB#0: +; KNL-NEXT: vpmovsxbq (%rdi), %zmm0 +; KNL-NEXT: vmovdqa64 %zmm0, (%rsi) +; KNL-NEXT: retq +; +; SKX-LABEL: extload_v8i64: +; SKX: ## BB#0: +; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 +; SKX-NEXT: vmovdqa64 %zmm0, (%rsi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %sign_load = load <8 x i8>, <8 x i8>* %a %c = sext <8 x i8> %sign_load to <8 x i64> store <8 x i64> %c, <8 x i64>* %res Index: test/CodeGen/X86/avx512-extract-subvector.ll =================================================================== --- test/CodeGen/X86/avx512-extract-subvector.ll +++ test/CodeGen/X86/avx512-extract-subvector.ll @@ -6,6 +6,7 @@ ; SKX-LABEL: extract_subvector128_v32i16: ; SKX: ## BB#0: ; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> ret <8 x i16> %r1 @@ -15,6 +16,7 @@ ; SKX-LABEL: extract_subvector128_v32i16_first_element: ; SKX: ## BB#0: ; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> ret <8 x i16> %r1 @@ -24,6 +26,7 @@ ; SKX-LABEL: extract_subvector128_v64i8: ; SKX: ## BB#0: ; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> ret <16 x i8> %r1 @@ -33,6 +36,7 @@ ; SKX-LABEL: extract_subvector128_v64i8_first_element: ; SKX: ## BB#0: ; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> ret <16 x i8> %r1 @@ -61,6 +65,7 @@ ; SKX-LABEL: extract_subvector256_v8f64_store: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> @@ -73,6 +78,7 @@ ; SKX-LABEL: extract_subvector256_v8f32_store: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> @@ -85,6 +91,7 @@ ; SKX-LABEL: extract_subvector256_v4i64_store: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> @@ -97,6 +104,7 @@ ; SKX-LABEL: extract_subvector256_v8i32_store: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> @@ -109,6 +117,7 @@ ; SKX-LABEL: extract_subvector256_v16i16_store: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> @@ -121,6 +130,7 @@ ; SKX-LABEL: extract_subvector256_v32i8_store: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> @@ -133,6 +143,7 @@ ; SKX-LABEL: extract_subvector256_v4f64_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> @@ -145,6 +156,7 @@ ; SKX-LABEL: 
extract_subvector256_v4f64_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> @@ -157,6 +169,7 @@ ; SKX-LABEL: extract_subvector256_v4f32_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> @@ -169,6 +182,7 @@ ; SKX-LABEL: extract_subvector256_v4f32_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> @@ -181,6 +195,7 @@ ; SKX-LABEL: extract_subvector256_v2i64_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> @@ -193,6 +208,7 @@ ; SKX-LABEL: extract_subvector256_v2i64_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> @@ -205,6 +221,7 @@ ; SKX-LABEL: extract_subvector256_v4i32_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> @@ -217,6 +234,7 @@ ; SKX-LABEL: extract_subvector256_v4i32_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> @@ -229,6 +247,7 @@ ; SKX-LABEL: extract_subvector256_v8i16_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> @@ -241,6 +260,7 @@ ; SKX-LABEL: extract_subvector256_v8i16_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> @@ -253,6 +273,7 @@ ; SKX-LABEL: extract_subvector256_v16i8_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> @@ -265,6 +286,7 @@ ; SKX-LABEL: extract_subvector256_v16i8_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> @@ -277,6 +299,7 @@ ; SKX-LABEL: extract_subvector512_v2f64_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> @@ -289,6 +312,7 @@ ; SKX-LABEL: extract_subvector512_v2f64_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> @@ -301,6 +325,7 @@ ; SKX-LABEL: extract_subvector512_v4f32_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> @@ -313,6 +338,7 @@ ; SKX-LABEL: extract_subvector512_v4f32_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: 
vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> @@ -325,6 +351,7 @@ ; SKX-LABEL: extract_subvector512_v2i64_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> @@ -337,6 +364,7 @@ ; SKX-LABEL: extract_subvector512_v2i64_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> @@ -349,6 +377,7 @@ ; SKX-LABEL: extract_subvector512_v4i32_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> @@ -361,6 +390,7 @@ ; SKX-LABEL: extract_subvector512_v4i32_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> @@ -373,6 +403,7 @@ ; SKX-LABEL: extract_subvector512_v8i16_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <8 x i32> @@ -385,6 +416,7 @@ ; SKX-LABEL: extract_subvector512_v16i8_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <16 x i32> @@ -397,6 +429,7 @@ ; SKX-LABEL: extract_subvector512_v16i8_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <16 x i32> @@ -409,6 +442,7 @@ ; SKX-LABEL: extract_subvector512_v4f64_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> @@ -421,6 +455,7 @@ ; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> @@ -433,6 +468,7 @@ ; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_32: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> @@ -445,6 +481,7 @@ ; SKX-LABEL: extract_subvector512_v8f32_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> @@ -457,6 +494,7 @@ ; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> @@ -469,6 +507,7 @@ ; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_32: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> @@ -481,6 +520,7 @@ ; SKX-LABEL: extract_subvector512_v4i64_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> @@ -493,6 
+533,7 @@ ; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> @@ -505,6 +546,7 @@ ; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_32: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> @@ -517,6 +559,7 @@ ; SKX-LABEL: extract_subvector512_v8i32_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> @@ -529,6 +572,7 @@ ; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> @@ -541,6 +585,7 @@ ; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_32: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> @@ -553,6 +598,7 @@ ; SKX-LABEL: extract_subvector512_v16i16_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> @@ -565,6 +611,7 @@ ; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> @@ -577,6 +624,7 @@ ; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_32: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> @@ -589,6 +637,7 @@ ; SKX-LABEL: extract_subvector512_v32i8_store_lo: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> @@ -601,6 +650,7 @@ ; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_16: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> @@ -613,6 +663,7 @@ ; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_32: ; SKX: ## BB#0: ## %entry ; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> @@ -654,6 +705,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextractf32x4 $1, %zmm1, %xmm0 {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <16 x float> @@ -669,6 +721,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextractf32x4 $1, %zmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = bitcast <8 x double> %__A to <16 x float> @@ -684,6 +737,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextractf64x2 $1, %ymm1, %xmm0 {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %shuffle = shufflevector <4 x double> %__A, <4 x double> undef, <2 x i32> @@ -698,6 +752,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq 
entry: %shuffle = shufflevector <4 x double> %__A, <4 x double> undef, <2 x i32> @@ -712,6 +767,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextracti64x2 $1, %ymm1, %xmm0 {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %shuffle = shufflevector <4 x i64> %__A, <4 x i64> undef, <2 x i32> @@ -726,6 +782,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0 {%k1} {z} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %shuffle = shufflevector <4 x i64> %__A, <4 x i64> undef, <2 x i32> @@ -740,6 +797,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextractf32x4 $1, %ymm1, %xmm0 {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %shuffle = shufflevector <8 x float> %__A, <8 x float> undef, <4 x i32> @@ -754,6 +812,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %shuffle = shufflevector <8 x float> %__A, <8 x float> undef, <4 x i32> @@ -768,6 +827,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextracti32x4 $1, %ymm1, %xmm0 {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = bitcast <4 x i64> %__A to <8 x i32> @@ -785,6 +845,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 {%k1} {z} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %0 = bitcast <4 x i64> %__A to <8 x i32> @@ -827,6 +888,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextractf64x2 $3, %zmm1, %xmm0 {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %shuffle = shufflevector <8 x double> %__A, <8 x double> undef, <2 x i32> @@ -841,6 +903,7 @@ ; SKX: ## BB#0: ## %entry ; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vextractf64x2 $3, %zmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq entry: %shuffle = shufflevector <8 x double> %__A, <8 x double> undef, <2 x i32> Index: test/CodeGen/X86/avx512-gather-scatter-intrin.ll =================================================================== --- test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -19,6 +19,7 @@ ; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2} ; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4) %ind2 = add <16 x i32> %ind, @@ -34,6 +35,7 @@ ; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2} ; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) %ind2 = add <8 x i32> %ind, @@ -49,6 +51,7 @@ ; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2} ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, @@ -64,6 +67,7 @@ ; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2} ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <8 x double> 
@llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, @@ -91,6 +95,7 @@ ; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2} ; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4) %ind2 = add <16 x i32> %ind, @@ -106,6 +111,7 @@ ; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2} ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, @@ -121,6 +127,7 @@ ; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2} ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) %ind2 = add <8 x i64> %ind, @@ -136,6 +143,7 @@ ; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2} ; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) %ind2 = add <8 x i32> %ind, @@ -149,6 +157,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1} ; CHECK-NEXT: vmovapd %zmm1, (%rdx) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) store <8 x double> %x, <8 x double>* %stbuf @@ -161,6 +170,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1} ; CHECK-NEXT: vmovapd %zmm1, (%rdx) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) store <8 x double> %x, <8 x double>* %stbuf @@ -195,6 +205,7 @@ ; CHECK-NEXT: kmovb %esi, %k1 ; CHECK-NEXT: vmovapd (%rdi), %zmm1 ; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <8 x double>, <8 x double>* %src, align 64 call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4) @@ -207,6 +218,7 @@ ; CHECK-NEXT: kmovb %esi, %k1 ; CHECK-NEXT: vmovapd (%rdi), %zmm1 ; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <8 x double>, <8 x double>* %src, align 64 call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4) @@ -219,6 +231,7 @@ ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmovaps (%rdi), %zmm1 ; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <16 x float>, <16 x float>* %src, align 64 call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4) @@ -231,6 +244,7 @@ ; CHECK-NEXT: kmovb %esi, %k1 ; CHECK-NEXT: vmovaps (%rdi), %ymm1 ; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <8 x float>, <8 x float>* %src, align 32 call void 
@llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4) @@ -245,6 +259,7 @@ ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} ; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4) %ind2 = add <8 x i64> %ind, @@ -267,6 +282,7 @@ ; CHECK-NEXT: movb $120, %al ; CHECK-NEXT: kmovb %eax, %k1 ; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0) call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1) @@ -300,7 +316,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %esi, %k1 ; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1} -; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8) %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8) @@ -336,7 +352,7 @@ ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1} ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1} -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8) %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8) @@ -391,6 +407,7 @@ ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vgatherqps (%rdi,%ymm1,2), %xmm0 {%k1} ; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2) @@ -409,6 +426,7 @@ ; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2} ; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1} ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4) %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2) @@ -474,7 +492,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %esi, %k1 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1} -; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) @@ -593,6 +611,7 @@ ; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1} ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2) call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4) @@ -608,6 +627,7 @@ ; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1} ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: 
vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2) call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4) @@ -653,6 +673,7 @@ ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1} ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2) call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4) @@ -668,6 +689,7 @@ ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1} ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2) call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4) @@ -713,6 +735,7 @@ ; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1} ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2) call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4) @@ -728,6 +751,7 @@ ; CHECK-NEXT: kxnorw %k0, %k0, %k2 ; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2} ; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2) call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4) @@ -773,6 +797,7 @@ ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1} ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2) call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4) @@ -788,6 +813,7 @@ ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2) call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4) @@ -807,6 +833,7 @@ ; CHECK-NEXT: movb $96, %al ; CHECK-NEXT: kmovb %eax, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2) call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4) Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -134,6 +134,7 @@ ; SKX-NEXT: vmovd %edi, %xmm1 ; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %e = extractelement <16 x float> %x, i32 %ind ret float %e @@ -154,6 +155,7 @@ ; SKX-NEXT: vmovq %rax, %xmm1 ; SKX-NEXT: 
vpermpd %zmm0, %zmm1, %zmm0 ; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %e = extractelement <8 x double> %x, i32 %ind ret double %e @@ -172,6 +174,7 @@ ; SKX-NEXT: vmovd %edi, %xmm1 ; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; SKX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %e = extractelement <8 x float> %x, i32 %ind ret float %e @@ -190,6 +193,7 @@ ; SKX-NEXT: vmovd %edi, %xmm1 ; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %e = extractelement <16 x i32> %x, i32 %ind ret i32 %e @@ -264,6 +268,7 @@ ; SKX-NEXT: testb %al, %al ; SKX-NEXT: cmoveq %rsi, %rdi ; SKX-NEXT: movq %rdi, %rax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %cmpvector_func.i = icmp slt <16 x i64> %a, %b %extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0 @@ -328,6 +333,7 @@ ; SKX-NEXT: testb %al, %al ; SKX-NEXT: cmoveq %rsi, %rdi ; SKX-NEXT: movq %rdi, %rax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %cmpvector_func.i = icmp slt <8 x i64> %a, %b %extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4 @@ -387,6 +393,7 @@ ; SKX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; SKX-NEXT: vpmovd2m %zmm2, %k0 ; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = load i1 , i1 * %addr, align 128 %a1 = bitcast i16 %a to <16 x i1> @@ -423,6 +430,7 @@ ; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; SKX-NEXT: vpmovq2m %zmm2, %k0 ; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = load i1 , i1 * %addr, align 128 %a1 = bitcast i8 %a to <8 x i1> @@ -444,6 +452,7 @@ ; SKX-NEXT: vpextrq $1, %xmm0, %rax ; SKX-NEXT: vextracti64x2 $1, %zmm0, %xmm0 ; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = extractelement <8 x i64> %x, i32 1 %r2 = extractelement <8 x i64> %x, i32 3 @@ -464,6 +473,7 @@ ; SKX-NEXT: vpextrq $1, %xmm0, %rax ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = extractelement <4 x i64> %x, i32 1 %r2 = extractelement <4 x i64> %x, i32 3 @@ -502,6 +512,7 @@ ; SKX-NEXT: vpextrd $1, %xmm0, %eax ; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0 ; SKX-NEXT: vpextrd $1, %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = extractelement <16 x i32> %x, i32 1 %r2 = extractelement <16 x i32> %x, i32 5 @@ -522,6 +533,7 @@ ; SKX-NEXT: vpextrd $1, %xmm0, %eax ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrd $1, %xmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = extractelement <8 x i32> %x, i32 1 %r2 = extractelement <8 x i32> %x, i32 5 @@ -562,6 +574,7 @@ ; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0 ; SKX-NEXT: vpextrw $1, %xmm0, (%rdi) ; SKX-NEXT: ## kill: %AX %AX %EAX +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = extractelement <32 x i16> %x, i32 1 %r2 = extractelement <32 x i16> %x, i32 9 @@ -584,6 +597,7 @@ ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrw $1, %xmm0, (%rdi) ; SKX-NEXT: ## kill: %AX %AX %EAX +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = extractelement <16 x i16> %x, i32 1 %r2 = extractelement <16 x i16> %x, i32 9 @@ -626,6 +640,7 @@ ; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0 ; SKX-NEXT: vpextrb $1, %xmm0, (%rdi) ; SKX-NEXT: ## kill: %AL %AL %EAX +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = extractelement <64 x i8> %x, i32 1 %r2 = extractelement <64 x i8> %x, i32 17 @@ -648,6 +663,7 @@ ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vpextrb $1, %xmm0, (%rdi) ; SKX-NEXT: ## kill: %AL 
%AL %EAX +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %r1 = extractelement <32 x i8> %x, i32 1 %r2 = extractelement <32 x i8> %x, i32 17 @@ -1218,6 +1234,7 @@ ; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; SKX-NEXT: vpmovw2m %zmm2, %k0 ; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %cmp_res_i1 = icmp ult i32 %a, %b %cmp_cmp_vec = icmp ult <32 x i32> %x, %y @@ -1408,6 +1425,7 @@ ; SKX-NEXT: kshiftrd $31, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax ; SKX-NEXT: andl $1, %eax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <32 x i8> %a, %b %t2 = extractelement <32 x i1> %t1, i32 2 @@ -1440,6 +1458,7 @@ ; SKX-NEXT: sete %al ; SKX-NEXT: addb $3, %al ; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <64 x i8> %a, %b %t2 = extractelement <64 x i1> %t1, i32 63 @@ -1506,6 +1525,7 @@ ; SKX-NEXT: movq (%rsp,%rdi,8), %rax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t2 = extractelement <4 x i64> %t1, i32 %index ret i64 %t2 @@ -1526,6 +1546,7 @@ ; SKX-NEXT: vmovq %rax, %xmm1 ; SKX-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t2 = extractelement <8 x i64> %t1, i32 %index ret i64 %t2 @@ -1590,6 +1611,7 @@ ; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t2 = extractelement <4 x double> %t1, i32 %index ret double %t2 @@ -1610,6 +1632,7 @@ ; SKX-NEXT: vmovq %rax, %xmm1 ; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t2 = extractelement <8 x double> %t1, i32 %index ret double %t2 @@ -1648,6 +1671,7 @@ ; SKX-NEXT: vmovd %edi, %xmm1 ; SKX-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t2 = extractelement <8 x i32> %t1, i32 %index ret i32 %t2 @@ -1666,6 +1690,7 @@ ; SKX-NEXT: vmovd %edi, %xmm1 ; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; SKX-NEXT: vmovd %xmm0, %eax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t2 = extractelement <16 x i32> %t1, i32 %index ret i32 %t2 @@ -1704,6 +1729,7 @@ ; SKX-NEXT: vmovd %edi, %xmm1 ; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; SKX-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t2 = extractelement <8 x float> %t1, i32 %index ret float %t2 @@ -1722,6 +1748,7 @@ ; SKX-NEXT: vmovd %edi, %xmm1 ; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; SKX-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t2 = extractelement <16 x float> %t1, i32 %index ret float %t2 @@ -1786,6 +1813,7 @@ ; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t2 = extractelement <16 x i16> %t1, i32 %index ret i16 %t2 @@ -1860,6 +1888,7 @@ ; SKX-NEXT: movb (%rdi,%rax), %al ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t2 = extractelement <32 x i8> %t1, i32 %index Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -168,14 +168,24 @@ } define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { -; CHECK-LABEL: zext_test1: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kshiftlw $10, %k0, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: retq +; KNL-LABEL: 
zext_test1: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: zext_test1: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; SKX-NEXT: kshiftlw $10, %k0, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i32 @@ -183,15 +193,26 @@ } define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { -; CHECK-LABEL: zext_test2: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kshiftlw $10, %k0, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: ## kill: %AX %AX %EAX -; CHECK-NEXT: retq +; KNL-LABEL: zext_test2: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: ## kill: %AX %AX %EAX +; KNL-NEXT: retq +; +; SKX-LABEL: zext_test2: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; SKX-NEXT: kshiftlw $10, %k0, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax +; SKX-NEXT: ## kill: %AX %AX %EAX +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i16 @@ -199,15 +220,26 @@ } define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { -; CHECK-LABEL: zext_test3: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kshiftlw $10, %k0, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: ## kill: %AL %AL %EAX -; CHECK-NEXT: retq +; KNL-LABEL: zext_test3: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: retq +; +; SKX-LABEL: zext_test3: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; SKX-NEXT: kshiftlw $10, %k0, %k0 +; SKX-NEXT: kshiftrw $15, %k0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: andl $1, %eax +; SKX-NEXT: ## kill: %AL %AL %EAX +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i8 @@ -257,6 +289,7 @@ ; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 ; SKX-NEXT: kandnw %k0, %k1, %k0 ; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x_gt_y = icmp sgt <4 x i64> %x, %y %x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1 @@ -356,10 +389,12 @@ ; SKX-NEXT: ## BB#2: ; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0 ; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; SKX-NEXT: LBB17_1: ; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 ; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %cond = icmp sgt i32 %a1, %b1 %cmp1 = icmp sgt <16 x i32> %a, zeroinitializer @@ -662,6 +697,7 @@ ; SKX-NEXT: kshiftlb $7, %k2, %k1 ; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %b = bitcast i8 %a to <8 x i1> %b1 = bitcast i16 %y to <16 x i1> @@ -985,9 +1021,11 @@ ; 
SKX-NEXT: je LBB41_2 ; SKX-NEXT: ## BB#1: ## %L1 ; SKX-NEXT: vmovapd %zmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; SKX-NEXT: LBB41_2: ## %L2 ; SKX-NEXT: vmovapd %zmm0, 8(%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %addr1 = getelementptr double, double * %base, i64 0 %addr2 = getelementptr double, double * %base, i64 1 @@ -1338,10 +1376,12 @@ ; SKX-NEXT: ## BB#1: ## %L1 ; SKX-NEXT: vmovaps %zmm0, (%rdi) ; SKX-NEXT: vmovaps %zmm1, 64(%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; SKX-NEXT: LBB42_2: ## %L2 ; SKX-NEXT: vmovaps %zmm0, 4(%rdi) ; SKX-NEXT: vmovaps %zmm1, 68(%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %addr1 = getelementptr float, float * %base, i64 0 %addr2 = getelementptr float, float * %base, i64 1 @@ -1573,6 +1613,7 @@ ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 ; SKX-NEXT: vpmovb2m %ymm0, %k0 ; SKX-NEXT: kmovd %k0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq store <32 x i1> %v, <32 x i1>* %a ret void @@ -1600,6 +1641,7 @@ ; SKX-NEXT: vpsllw $15, %zmm0, %zmm0 ; SKX-NEXT: vpmovw2m %zmm0, %k0 ; SKX-NEXT: kmovd %k0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %v1 = trunc <32 x i16> %v to <32 x i1> store <32 x i1> %v1, <32 x i1>* %a @@ -1934,6 +1976,7 @@ ; SKX-NEXT: vpsllw $7, %zmm0, %zmm0 ; SKX-NEXT: vpmovb2m %zmm0, %k0 ; SKX-NEXT: kmovq %k0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq store <64 x i1> %v, <64 x i1>* %a ret void @@ -1955,6 +1998,7 @@ ; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; SKX-NEXT: kmovb %k0, %eax ; SKX-NEXT: addl %eax, %eax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %v1 = icmp eq <16 x i32> %a, zeroinitializer %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> @@ -1965,13 +2009,22 @@ } define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { -; CHECK-LABEL: test_bitcast_v16i1_zext: -; CHECK: ## BB#0: -; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: addl %eax, %eax -; CHECK-NEXT: retq +; KNL-LABEL: test_bitcast_v16i1_zext: +; KNL: ## BB#0: +; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: addl %eax, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test_bitcast_v16i1_zext: +; SKX: ## BB#0: +; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: addl %eax, %eax +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %v1 = icmp eq <16 x i32> %a, zeroinitializer %mask1 = bitcast <16 x i1> %v1 to i16 %val = zext i16 %mask1 to i32 Index: test/CodeGen/X86/avx512-mask-spills.ll =================================================================== --- test/CodeGen/X86/avx512-mask-spills.ll +++ test/CodeGen/X86/avx512-mask-spills.ll @@ -37,6 +37,7 @@ ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload @@ -62,6 +63,7 @@ ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload @@ -86,6 +88,7 @@ ; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill ; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovd %k0, (%rsp) ## 
4-byte Spill +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload ; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload @@ -110,6 +113,7 @@ ; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill ; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _f ; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload ; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload Index: test/CodeGen/X86/avx512-masked-memop-64-32.ll =================================================================== --- test/CodeGen/X86/avx512-masked-memop-64-32.ll +++ test/CodeGen/X86/avx512-masked-memop-64-32.ll @@ -32,6 +32,7 @@ ; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask) @@ -56,6 +57,7 @@ ; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v16f32.p0v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask) @@ -67,6 +69,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>) ret void @@ -143,6 +146,7 @@ ; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: kshiftrw $8, %k1, %k1 ; AVX512F-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1} +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test_store_16i64: @@ -152,6 +156,7 @@ ; SKX-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1} ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask) ret void @@ -167,6 +172,7 @@ ; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1} ; AVX512F-NEXT: kshiftrw $8, %k1, %k1 ; AVX512F-NEXT: vmovupd %zmm2, 64(%rdi) {%k1} +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test_store_16f64: @@ -176,6 +182,7 @@ ; SKX-NEXT: vmovupd %zmm1, (%rdi) {%k1} ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask) ret void Index: test/CodeGen/X86/avx512-masked_memop-16-8.ll =================================================================== --- test/CodeGen/X86/avx512-masked_memop-16-8.ll +++ test/CodeGen/X86/avx512-masked_memop-16-8.ll @@ -93,6 +93,7 @@ ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0 ; CHECK-NEXT: vpmovb2m %ymm0, %k1 ; CHECK-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask) ret void @@ -105,6 +106,7 @@ ; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0 ; CHECK-NEXT: vpmovb2m %zmm0, %k1 ; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x 
i1>%mask) ret void @@ -129,6 +131,7 @@ ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 ; CHECK-NEXT: vpmovb2m %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask) ret void @@ -141,6 +144,7 @@ ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0 ; CHECK-NEXT: vpmovb2m %ymm0, %k1 ; CHECK-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1} +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask) ret void Index: test/CodeGen/X86/avx512-skx-insert-subvec.ll =================================================================== --- test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -35,6 +35,7 @@ ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; CHECK-NEXT: vpmovq2m %zmm0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <2 x i1> %a, <2 x i1> zeroinitializer, <8 x i32> ret <8 x i1> %res Index: test/CodeGen/X86/avx512-trunc.ll =================================================================== --- test/CodeGen/X86/avx512-trunc.ll +++ test/CodeGen/X86/avx512-trunc.ll @@ -8,6 +8,7 @@ ; ALL-LABEL: trunc_16x32_to_16x8: ; ALL: ## BB#0: ; ALL-NEXT: vpmovdb %zmm0, %xmm0 +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <16 x i32> %i to <16 x i8> ret <16 x i8> %x @@ -17,6 +18,7 @@ ; ALL-LABEL: trunc_8x64_to_8x16: ; ALL: ## BB#0: ; ALL-NEXT: vpmovqw %zmm0, %xmm0 +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <8 x i64> %i to <8 x i16> ret <8 x i16> %x @@ -35,6 +37,7 @@ ; ALL-LABEL: trunc_qb_512: ; ALL: ## BB#0: ; ALL-NEXT: vpmovqw %zmm0, %xmm0 +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <8 x i64> %i to <8 x i8> ret <8 x i8> %x @@ -44,6 +47,7 @@ ; ALL-LABEL: trunc_qb_512_mem: ; ALL: ## BB#0: ; ALL-NEXT: vpmovqb %zmm0, (%rdi) +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <8 x i64> %i to <8 x i8> store <8 x i8> %x, <8 x i8>* %res @@ -56,11 +60,13 @@ ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qb_256: ; SKX: ## BB#0: ; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <4 x i64> %i to <4 x i8> ret <4 x i8> %x @@ -73,11 +79,13 @@ ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; KNL-NEXT: vmovd %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qb_256_mem: ; SKX: ## BB#0: ; SKX-NEXT: vpmovqb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <4 x i64> %i to <4 x i8> store <4 x i8> %x, <4 x i8>* %res @@ -112,6 +120,7 @@ ; ALL-LABEL: trunc_qw_512: ; ALL: ## BB#0: ; ALL-NEXT: vpmovqw %zmm0, %xmm0 +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <8 x i64> %i to <8 x i16> ret <8 x i16> %x @@ -121,6 +130,7 @@ ; ALL-LABEL: trunc_qw_512_mem: ; ALL: ## BB#0: ; ALL-NEXT: vpmovqw %zmm0, (%rdi) +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <8 x i64> %i to <8 x i16> store <8 x i16> %x, <8 x i16>* %res @@ -133,11 +143,13 @@ ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qw_256: ; SKX: ## BB#0: ; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <4 x i64> %i to <4 x i16> ret <4 x i16> %x 
@@ -150,11 +162,13 @@ ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qw_256_mem: ; SKX: ## BB#0: ; SKX-NEXT: vpmovqw %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <4 x i64> %i to <4 x i16> store <4 x i16> %x, <4 x i16>* %res @@ -199,6 +213,7 @@ ; ALL-LABEL: trunc_qd_512_mem: ; ALL: ## BB#0: ; ALL-NEXT: vpmovqd %zmm0, (%rdi) +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <8 x i64> %i to <8 x i32> store <8 x i32> %x, <8 x i32>* %res @@ -211,11 +226,13 @@ ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qd_256: ; SKX: ## BB#0: ; SKX-NEXT: vpmovqd %ymm0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <4 x i64> %i to <4 x i32> ret <4 x i32> %x @@ -227,11 +244,13 @@ ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vmovdqa %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_qd_256_mem: ; SKX: ## BB#0: ; SKX-NEXT: vpmovqd %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <4 x i64> %i to <4 x i32> store <4 x i32> %x, <4 x i32>* %res @@ -266,6 +285,7 @@ ; ALL-LABEL: trunc_db_512: ; ALL: ## BB#0: ; ALL-NEXT: vpmovdb %zmm0, %xmm0 +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <16 x i32> %i to <16 x i8> ret <16 x i8> %x @@ -275,6 +295,7 @@ ; ALL-LABEL: trunc_db_512_mem: ; ALL: ## BB#0: ; ALL-NEXT: vpmovdb %zmm0, (%rdi) +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <16 x i32> %i to <16 x i8> store <16 x i8> %x, <16 x i8>* %res @@ -287,11 +308,13 @@ ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_db_256: ; SKX: ## BB#0: ; SKX-NEXT: vpmovdw %ymm0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <8 x i32> %i to <8 x i8> ret <8 x i8> %x @@ -304,11 +327,13 @@ ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; KNL-NEXT: vmovq %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_db_256_mem: ; SKX: ## BB#0: ; SKX-NEXT: vpmovdb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <8 x i32> %i to <8 x i8> store <8 x i8> %x, <8 x i8>* %res @@ -352,6 +377,7 @@ ; ALL-LABEL: trunc_dw_512_mem: ; ALL: ## BB#0: ; ALL-NEXT: vpmovdw %zmm0, (%rdi) +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x = trunc <16 x i32> %i to <16 x i16> store <16 x i16> %x, <16 x i16>* %res @@ -364,11 +390,13 @@ ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_dw_256: ; SKX: ## BB#0: ; SKX-NEXT: vpmovdw %ymm0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <8 x i32> %i to <8 x i16> ret <8 x i16> %x @@ -380,11 +408,13 @@ ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: vmovdqa %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_dw_256_mem: ; SKX: ## BB#0: ; SKX-NEXT: vpmovdw %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <8 x i32> %i to <8 x i16> store <8 x i16> %x, <8 x i16>* %res @@ -434,11 +464,13 @@ ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; KNL-NEXT: vmovdqa %ymm0, 
(%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_wb_512_mem: ; SKX: ## BB#0: ; SKX-NEXT: vpmovwb %zmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <32 x i16> %i to <32 x i8> store <32 x i8> %x, <32 x i8>* %res @@ -450,11 +482,13 @@ ; KNL: ## BB#0: ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_wb_256: ; SKX: ## BB#0: ; SKX-NEXT: vpmovwb %ymm0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <16 x i16> %i to <16 x i8> ret <16 x i8> %x @@ -466,11 +500,13 @@ ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vmovdqa %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_wb_256_mem: ; SKX: ## BB#0: ; SKX-NEXT: vpmovwb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x = trunc <16 x i16> %i to <16 x i8> store <16 x i8> %x, <16 x i8>* %res @@ -509,11 +545,13 @@ ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vmovdqu %xmm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: usat_trunc_wb_256_mem: ; SKX: ## BB#0: ; SKX-NEXT: vpmovuswb %ymm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x3 = icmp ult <16 x i16> %i, %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> @@ -528,11 +566,13 @@ ; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: usat_trunc_wb_256: ; SKX: ## BB#0: ; SKX-NEXT: vpmovuswb %ymm0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x3 = icmp ult <16 x i16> %i, %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> @@ -563,6 +603,7 @@ ; ALL-LABEL: usat_trunc_db_512_mem: ; ALL: ## BB#0: ; ALL-NEXT: vpmovusdb %zmm0, (%rdi) +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x3 = icmp ult <16 x i32> %i, %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> @@ -575,6 +616,7 @@ ; ALL-LABEL: usat_trunc_qb_512_mem: ; ALL: ## BB#0: ; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x3 = icmp ult <8 x i64> %i, %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> @@ -587,6 +629,7 @@ ; ALL-LABEL: usat_trunc_qd_512_mem: ; ALL: ## BB#0: ; ALL-NEXT: vpmovusqd %zmm0, (%rdi) +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x3 = icmp ult <8 x i64> %i, %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> @@ -599,6 +642,7 @@ ; ALL-LABEL: usat_trunc_qw_512_mem: ; ALL: ## BB#0: ; ALL-NEXT: vpmovusqw %zmm0, (%rdi) +; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x3 = icmp ult <8 x i64> %i, %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> @@ -638,6 +682,7 @@ ; KNL-NEXT: vpmovusdb %zmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; KNL-NEXT: vmovdqu %ymm0, (%rdi) +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: usat_trunc_db_1024_mem: @@ -649,6 +694,7 @@ ; SKX-NEXT: vpmovdw %zmm1, %ymm1 ; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; SKX-NEXT: vpmovwb %zmm0, (%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x3 = icmp ult <32 x i32> %i, %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> @@ -714,6 +760,7 @@ ; KNL-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: usat_trunc_db_256: @@ -721,6 +768,7 @@ ; SKX-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; SKX-NEXT: vpmovdw %ymm0, %xmm0 ; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SKX-NEXT: vzeroupper ; 
SKX-NEXT: retq %tmp1 = icmp ult <8 x i32> %x, %tmp2 = select <8 x i1> %tmp1, <8 x i32> %x, <8 x i32> Index: test/CodeGen/X86/avx512-vec-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-vec-cmp.ll +++ test/CodeGen/X86/avx512-vec-cmp.ll @@ -160,13 +160,22 @@ } define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind { -; CHECK-LABEL: test12: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 -; CHECK-NEXT: kunpckbw %k0, %k1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; KNL-LABEL: test12: +; KNL: ## BB#0: +; KNL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 +; KNL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; KNL-NEXT: kunpckbw %k0, %k1, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test12: +; SKX: ## BB#0: +; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; SKX-NEXT: kunpckbw %k0, %k1, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %res = icmp eq <16 x i64> %a, %b %res1 = bitcast <16 x i1> %res to i16 ret i16 %res1 @@ -326,6 +335,7 @@ ; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 ; SKX-NEXT: kunpckwd %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %res = icmp eq <32 x i32> %a, %b %res1 = bitcast <32 x i1> %res to i32 @@ -637,6 +647,7 @@ ; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 ; SKX-NEXT: kunpckdq %k0, %k1, %k0 ; SKX-NEXT: kmovq %k0, %rax +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %res = icmp eq <64 x i16> %a, %b %res1 = bitcast <64 x i1> %res to i64 @@ -892,6 +903,7 @@ ; SKX-NEXT: vpcmpgtd %zmm3, %zmm2, %k1 ; SKX-NEXT: kxorw %k1, %k0, %k0 ; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %x_gt_y = icmp sgt <16 x i32> %x, %y %x1_gt_y1 = icmp sgt <16 x i32> %x1, %y1 Index: test/CodeGen/X86/combine-testm-and.ll =================================================================== --- test/CodeGen/X86/combine-testm-and.ll +++ test/CodeGen/X86/combine-testm-and.ll @@ -6,6 +6,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 ; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %and.i = and <8 x i64> %b, %a %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 -1) @@ -19,6 +20,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1} ; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %and.i = and <8 x i64> %b, %a %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 %mask) @@ -32,6 +34,7 @@ ; CHECK-NEXT: kmovb %esi, %k1 ; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1} ; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %b = load <8 x i64>, <8 x i64>* %bptr %and.i = and <8 x i64> %a, %b @@ -46,6 +49,7 @@ ; CHECK-NEXT: kmovb %esi, %k1 ; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1} ; CHECK-NEXT: kmovb %k0, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %b = load <8 x i64>, <8 x i64>* %bptr %and.i = and <8 x i64> %b, %a Index: test/CodeGen/X86/compress_expand.ll =================================================================== --- test/CodeGen/X86/compress_expand.ll +++ test/CodeGen/X86/compress_expand.ll @@ -94,12 +94,20 @@ declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>) define void @test6(float* %base, <16 x float> %V) { -; ALL-LABEL: test6: -; ALL: # BB#0: -; ALL-NEXT: movw $-2049, %ax # imm = 0xF7FF -; ALL-NEXT: kmovw %eax, %k1 -; ALL-NEXT: vcompressps %zmm0, (%rdi) 
{%k1} -; ALL-NEXT: retq +; SKX-LABEL: test6: +; SKX: # BB#0: +; SKX-NEXT: movw $-2049, %ax # imm = 0xF7FF +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: vcompressps %zmm0, (%rdi) {%k1} +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; KNL-LABEL: test6: +; KNL: # BB#0: +; KNL-NEXT: movw $-2049, %ax # imm = 0xF7FF +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1} +; KNL-NEXT: retq call void @llvm.masked.compressstore.v16f32(<16 x float> %V, float* %base, <16 x i1> ) ret void } @@ -110,6 +118,7 @@ ; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 ; SKX-NEXT: vpmovw2m %xmm1, %k1 ; SKX-NEXT: vcompressps %ymm0, (%rdi) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; KNL-LABEL: test7: @@ -132,6 +141,7 @@ ; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 ; SKX-NEXT: vpmovw2m %xmm1, %k1 ; SKX-NEXT: vcompresspd %zmm0, (%rdi) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; KNL-LABEL: test8: @@ -151,6 +161,7 @@ ; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 ; SKX-NEXT: vpmovw2m %xmm1, %k1 ; SKX-NEXT: vpcompressq %zmm0, (%rdi) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; KNL-LABEL: test9: @@ -170,6 +181,7 @@ ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 ; SKX-NEXT: vpcompressq %ymm0, (%rdi) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; KNL-LABEL: test10: @@ -341,16 +353,28 @@ } define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) { -; ALL-LABEL: test17: -; ALL: # BB#0: -; ALL-NEXT: vpxord %zmm4, %zmm4, %zmm4 -; ALL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 -; ALL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 -; ALL-NEXT: kmovw %k2, %eax -; ALL-NEXT: popcntl %eax, %eax -; ALL-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1} -; ALL-NEXT: vcompressps %zmm0, (%rdi) {%k2} -; ALL-NEXT: retq +; SKX-LABEL: test17: +; SKX: # BB#0: +; SKX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 +; SKX-NEXT: kmovw %k2, %eax +; SKX-NEXT: popcntl %eax, %eax +; SKX-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1} +; SKX-NEXT: vcompressps %zmm0, (%rdi) {%k2} +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; KNL-LABEL: test17: +; KNL: # BB#0: +; KNL-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; KNL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; KNL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2 +; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: popcntl %eax, %eax +; KNL-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1} +; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k2} +; KNL-NEXT: retq %mask = icmp eq <32 x i32> %trigger, zeroinitializer call void @llvm.masked.compressstore.v32f32(<32 x float> %V, float* %base, <32 x i1> %mask) ret void @@ -366,6 +390,7 @@ ; SKX-NEXT: popcntl %eax, %eax ; SKX-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} ; SKX-NEXT: vcompresspd %zmm0, (%rdi) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; KNL-LABEL: test18: Index: test/CodeGen/X86/fast-isel-nontemporal.ll =================================================================== --- test/CodeGen/X86/fast-isel-nontemporal.ll +++ test/CodeGen/X86/fast-isel-nontemporal.ll @@ -379,6 +379,7 @@ ; AVX512-LABEL: test_nt8xfloat: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: store <8 x float> %X, <8 x float>* %ptr, align 32, !nontemporal !1 @@ -401,6 +402,7 @@ ; AVX512-LABEL: test_nt4xdouble: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vmovntpd %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: store <4 x double> %X, <4 x double>* %ptr, align 32, !nontemporal !1 @@ -423,6 +425,7 @@ ; AVX512-LABEL: test_nt32xi8: ; AVX512: # BB#0: # %entry ; 
AVX512-NEXT: vmovntdq %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: store <32 x i8> %X, <32 x i8>* %ptr, align 32, !nontemporal !1 @@ -445,6 +448,7 @@ ; AVX512-LABEL: test_nt16xi16: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vmovntdq %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: store <16 x i16> %X, <16 x i16>* %ptr, align 32, !nontemporal !1 @@ -467,6 +471,7 @@ ; AVX512-LABEL: test_nt8xi32: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vmovntdq %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: store <8 x i32> %X, <8 x i32>* %ptr, align 32, !nontemporal !1 @@ -489,6 +494,7 @@ ; AVX512-LABEL: test_nt4xi64: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vmovntdq %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: store <4 x i64> %X, <4 x i64>* %ptr, align 32, !nontemporal !1 @@ -750,6 +756,7 @@ ; AVX512-LABEL: test_nt16xfloat: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vmovntps %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: store <16 x float> %X, <16 x float>* %ptr, align 64, !nontemporal !1 @@ -775,6 +782,7 @@ ; AVX512-LABEL: test_nt8xdouble: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vmovntpd %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: store <8 x double> %X, <8 x double>* %ptr, align 64, !nontemporal !1 @@ -801,11 +809,13 @@ ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_nt64xi8: ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq entry: store <64 x i8> %X, <64 x i8>* %ptr, align 64, !nontemporal !1 @@ -832,11 +842,13 @@ ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_nt32xi16: ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq entry: store <32 x i16> %X, <32 x i16>* %ptr, align 64, !nontemporal !1 @@ -862,6 +874,7 @@ ; AVX512-LABEL: test_nt16xi32: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vmovntdq %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: store <16 x i32> %X, <16 x i32>* %ptr, align 64, !nontemporal !1 @@ -887,6 +900,7 @@ ; AVX512-LABEL: test_nt8xi64: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vmovntdq %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: store <8 x i64> %X, <8 x i64>* %ptr, align 64, !nontemporal !1 Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -233,6 +233,7 @@ ; KNL_64-NEXT: kmovw %k1, %k2 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test5: @@ -242,6 +243,7 @@ ; KNL_32-NEXT: kmovw %k1, %k2 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2} ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1} +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test5: @@ -250,6 +252,7 @@ ; SKX-NEXT: kmovw %k1, %k2 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test5: @@ -259,6 +262,7 @@ ; SKX_32-NEXT: kmovw %k1, %k2 ; 
SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2} ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1} +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 @@ -794,6 +798,7 @@ ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1} ; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test15: @@ -808,6 +813,7 @@ ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1} ; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test15: @@ -904,6 +910,7 @@ ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovapd %xmm2, %xmm0 +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test17: @@ -917,6 +924,7 @@ ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovapd %xmm2, %xmm0 +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test17: @@ -960,6 +968,7 @@ ; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test18: @@ -973,6 +982,7 @@ ; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test18: @@ -980,6 +990,7 @@ ; SKX-NEXT: vpslld $31, %xmm2, %xmm2 ; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test18: @@ -1006,6 +1017,7 @@ ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1} +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test19: @@ -1021,6 +1033,7 @@ ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1} +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test19: @@ -1028,6 +1041,7 @@ ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test19: @@ -1036,6 +1050,7 @@ ; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1} +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl %gep = getelementptr double, double* %ptr, <4 x i64> %ind call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask) @@ -1055,6 +1070,7 @@ ; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test20: @@ -1068,6 +1084,7 @@ ; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test20: @@ -1078,6 +1095,7 @@ ; SKX-NEXT: kshiftlb $6, %k0, %k0 ; SKX-NEXT: kshiftrb $6, %k0, %k1 ; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test20: @@ -1105,6 +1123,7 @@ ; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 ; 
KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test21: @@ -1116,6 +1135,7 @@ ; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test21: @@ -1127,6 +1147,7 @@ ; SKX-NEXT: kshiftrb $6, %k0, %k1 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test21: @@ -1138,6 +1159,7 @@ ; SKX_32-NEXT: kshiftrb $6, %k0, %k1 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) ret void @@ -1161,6 +1183,7 @@ ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} ; KNL_64-NEXT: vmovaps %xmm2, %xmm0 +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test22: @@ -1176,6 +1199,7 @@ ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1} ; KNL_32-NEXT: vmovaps %xmm2, %xmm0 +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test22: @@ -1221,6 +1245,7 @@ ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23: @@ -1234,6 +1259,7 @@ ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test23: @@ -1266,6 +1292,7 @@ ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test24: @@ -1278,6 +1305,7 @@ ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test24: @@ -1312,6 +1340,7 @@ ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test25: @@ -1325,6 +1354,7 @@ ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test25: @@ -1359,6 +1389,7 @@ ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test26: @@ -1372,6 +1403,7 @@ ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test26: @@ -1405,6 +1437,7 @@ ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} ; KNL_64-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test27: @@ -1416,6 +1449,7 @@ ; KNL_32-NEXT: kmovw %ecx, %k1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} ; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_32-NEXT: vzeroupper ; 
KNL_32-NEXT: retl ; ; SKX-LABEL: test27: @@ -1451,6 +1485,7 @@ ; KNL_64-NEXT: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test28: @@ -1462,6 +1497,7 @@ ; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test28: @@ -1471,6 +1507,7 @@ ; SKX-NEXT: movb $3, %al ; SKX-NEXT: kmovb %eax, %k1 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test28: @@ -1480,6 +1517,7 @@ ; SKX_32-NEXT: movb $3, %al ; SKX_32-NEXT: kmovb %eax, %k1 ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> ) ret void @@ -1852,6 +1890,7 @@ ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2} +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test_scatter_16i32: @@ -1860,6 +1899,7 @@ ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test_scatter_16i32: @@ -1871,6 +1911,7 @@ ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} ; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_scatter_16i32: @@ -1879,6 +1920,7 @@ ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask) ret void @@ -1892,6 +1934,7 @@ ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2} +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test_scatter_16i64: @@ -1916,6 +1959,7 @@ ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test_scatter_16i64: @@ -1926,6 +1970,7 @@ ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_scatter_16i64: @@ -1950,6 +1995,7 @@ ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask) ret void @@ -1965,6 +2011,7 @@ ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test_scatter_16f32: @@ -1973,6 +2020,7 @@ ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test_scatter_16f32: @@ -1984,6 +2032,7 @@ ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) 
{%k2} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_scatter_16f32: @@ -1992,6 +2041,7 @@ ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask) ret void @@ -2006,6 +2056,7 @@ ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} +; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test_scatter_16f64: @@ -2030,6 +2081,7 @@ ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test_scatter_16f64: @@ -2040,6 +2092,7 @@ ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_scatter_16f64: @@ -2064,6 +2117,7 @@ ; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: vzeroupper ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask) ret void @@ -2086,6 +2140,34 @@ ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; KNL_64-NEXT: retq ; +; KNL_32-LABEL: test_pr28312: +; KNL_32: # BB#0: +; KNL_32-NEXT: pushl %ebp +; KNL_32-NEXT: .Lcfi12: +; KNL_32-NEXT: .cfi_def_cfa_offset 8 +; KNL_32-NEXT: .Lcfi13: +; KNL_32-NEXT: .cfi_offset %ebp, -8 +; KNL_32-NEXT: movl %esp, %ebp +; KNL_32-NEXT: .Lcfi14: +; KNL_32-NEXT: .cfi_def_cfa_register %ebp +; KNL_32-NEXT: andl $-32, %esp +; KNL_32-NEXT: subl $32, %esp +; KNL_32-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 +; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 +; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 +; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 +; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1 +; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL_32-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1} +; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0 +; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; KNL_32-NEXT: movl %ebp, %esp +; KNL_32-NEXT: popl %ebp +; KNL_32-NEXT: retl +; ; SKX-LABEL: test_pr28312: ; SKX: # BB#0: ; SKX-NEXT: vpslld $31, %xmm1, %xmm1 @@ -2094,6 +2176,27 @@ ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0 ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; SKX-NEXT: retq +; +; SKX_32-LABEL: test_pr28312: +; SKX_32: # BB#0: +; SKX_32-NEXT: pushl %ebp +; SKX_32-NEXT: .Lcfi13: +; SKX_32-NEXT: .cfi_def_cfa_offset 8 +; SKX_32-NEXT: .Lcfi14: +; SKX_32-NEXT: .cfi_offset %ebp, -8 +; SKX_32-NEXT: movl %esp, %ebp +; SKX_32-NEXT: .Lcfi15: +; SKX_32-NEXT: .cfi_def_cfa_register %ebp +; SKX_32-NEXT: andl $-32, %esp +; SKX_32-NEXT: subl $32, %esp +; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 +; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1 +; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1} +; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0 +; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; SKX_32-NEXT: movl %ebp, %esp +; SKX_32-NEXT: popl %ebp +; SKX_32-NEXT: retl %g1 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) %g2 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) %g3 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x 
i1> %k, <4 x i64> undef) Index: test/CodeGen/X86/masked_memop.ll =================================================================== --- test/CodeGen/X86/masked_memop.ll +++ test/CodeGen/X86/masked_memop.ll @@ -415,6 +415,7 @@ ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1} +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: test12: @@ -422,6 +423,7 @@ ; SKX-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 ; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1} +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask) @@ -998,12 +1000,14 @@ ; AVX512F: ## BB#0: ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovlps %xmm0, 16(%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; SKX-LABEL: one_mask_bit_set3: ; SKX: ## BB#0: ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; SKX-NEXT: vmovq %xmm0, 16(%rdi) +; SKX-NEXT: vzeroupper ; SKX-NEXT: retq call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1>) ret void @@ -1023,6 +1027,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1>) ret void @@ -1042,6 +1047,7 @@ ; AVX512: ## BB#0: ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>) ret void Index: test/CodeGen/X86/nontemporal-2.ll =================================================================== --- test/CodeGen/X86/nontemporal-2.ll +++ test/CodeGen/X86/nontemporal-2.ll @@ -255,6 +255,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0 ; VLX-NEXT: vmovntdq %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1 ret void @@ -279,6 +280,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0 ; VLX-NEXT: vmovntdq %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1 ret void @@ -303,6 +305,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0 ; VLX-NEXT: vmovntdq %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1 ret void @@ -327,6 +330,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0 ; VLX-NEXT: vmovntdq %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1 ret void @@ -351,6 +355,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0 ; VLX-NEXT: vmovntdq %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1 ret void @@ -375,6 +380,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0 ; VLX-NEXT: vmovntdq %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1 ret void @@ -757,6 +763,7 @@ ; VLX-LABEL: test_arg_v8f32: ; VLX: # BB#0: ; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <8 x float> %arg, <8 x float>* %dst, align 
32, !nontemporal !1 ret void @@ -778,6 +785,7 @@ ; VLX-LABEL: test_arg_v8i32: ; VLX: # BB#0: ; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1 ret void @@ -799,6 +807,7 @@ ; VLX-LABEL: test_arg_v4f64: ; VLX: # BB#0: ; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1 ret void @@ -820,6 +829,7 @@ ; VLX-LABEL: test_arg_v4i64: ; VLX: # BB#0: ; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1 ret void @@ -841,6 +851,7 @@ ; VLX-LABEL: test_arg_v16i16: ; VLX: # BB#0: ; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1 ret void @@ -862,6 +873,7 @@ ; VLX-LABEL: test_arg_v32i8: ; VLX: # BB#0: ; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1 ret void @@ -1031,6 +1043,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; VLX-NEXT: vmovntps %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq %r = fadd <8 x float> %a, %b store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1 @@ -1068,6 +1081,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; VLX-NEXT: vmovntdq %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq %r = add <8 x i32> %a, %b store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1 @@ -1094,6 +1108,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; VLX-NEXT: vmovntpd %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq %r = fadd <4 x double> %a, %b store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1 @@ -1131,6 +1146,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; VLX-NEXT: vmovntdq %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq %r = add <4 x i64> %a, %b store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1 @@ -1168,6 +1184,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; VLX-NEXT: vmovntdq %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq %r = add <16 x i16> %a, %b store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1 @@ -1205,6 +1222,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; VLX-NEXT: vmovntdq %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq %r = add <32 x i8> %a, %b store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1 @@ -1235,6 +1253,7 @@ ; VLX: # BB#0: ; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; VLX-NEXT: vmovups %ymm0, (%rdi) +; VLX-NEXT: vzeroupper ; VLX-NEXT: retq %r = fadd <8 x float> %a, %b store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1 Index: test/CodeGen/X86/pmul.ll =================================================================== --- test/CodeGen/X86/pmul.ll +++ test/CodeGen/X86/pmul.ll @@ -55,6 +55,7 @@ ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v16i8c: @@ -63,6 +64,7 @@ ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq entry: %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 
117, i8 117 > @@ -195,6 +197,7 @@ ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v16i8: @@ -204,6 +207,7 @@ ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq entry: %A = mul <16 x i8> %i, %j @@ -1168,24 +1172,15 @@ ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX2-LABEL: mul_v4i64_zero_upper: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: mul_v4i64_zero_upper: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX512-NEXT: retq +; AVX-LABEL: mul_v4i64_zero_upper: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq entry: %val1a = zext <4 x i32> %val1 to <4 x i64> %val2a = zext <4 x i32> %val2 to <4 x i64> @@ -1237,30 +1232,18 @@ ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; -; AVX2-LABEL: mul_v4i64_zero_upper_left: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: mul_v4i64_zero_upper_left: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 -; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX512-NEXT: retq +; AVX-LABEL: mul_v4i64_zero_upper_left: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsllq $32, %ymm0, %ymm0 +; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq 
entry: %val1a = zext <4 x i32> %val1 to <4 x i64> %res64 = mul <4 x i64> %val1a, %val2 @@ -1301,26 +1284,16 @@ ; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX2-LABEL: mul_v4i64_zero_lower: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: mul_v4i64_zero_lower: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX512-NEXT: retq +; AVX-LABEL: mul_v4i64_zero_lower: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsllq $32, %ymm0, %ymm0 +; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq entry: %val1a = zext <4 x i32> %val1 to <4 x i64> %val2a = and <4 x i64> %val2, Index: test/CodeGen/X86/pr29112.ll =================================================================== --- test/CodeGen/X86/pr29112.ll +++ test/CodeGen/X86/pr29112.ll @@ -60,6 +60,7 @@ ; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm9, (%rsp) ; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -81,6 +81,7 @@ ; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: sad_16i8: @@ -106,6 +107,7 @@ ; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq entry: br label %vector.body @@ -330,6 +332,7 @@ ; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: sad_32i8: @@ -357,6 +360,7 @@ ; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq entry: br label %vector.body @@ -808,6 +812,7 @@ ; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: sad_avx64i8: @@ -836,6 +841,7 @@ ; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = 
zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15] ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq entry: br label %vector.body @@ -1273,6 +1279,7 @@ ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: sad_nonloop_32i8: @@ -1284,6 +1291,7 @@ ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %v1 = load <32 x i8>, <32 x i8>* %p, align 1 %z1 = zext <32 x i8> %v1 to <32 x i32> Index: test/CodeGen/X86/shuffle-vs-trunc-256.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -31,6 +31,7 @@ ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: @@ -42,6 +43,7 @@ ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v16i8: @@ -53,6 +55,7 @@ ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: @@ -64,6 +67,7 @@ ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> @@ -89,6 +93,7 @@ ; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v16i16_to_v16i8: @@ -96,6 +101,7 @@ ; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v16i16_to_v16i8: @@ -103,12 +109,14 @@ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8: ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %bc = bitcast <32 x i8> %vec to <16 x i16> @@ -139,6 +147,7 @@ ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: @@ -153,6 +162,7 @@ ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: @@ -164,6 +174,7 @@ ; AVX512BW-NEXT: vpshufb %xmm2, 
%xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: @@ -178,6 +189,7 @@ ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -200,12 +212,14 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v8i32_to_v8i16: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v8i32_to_v8i16: @@ -213,12 +227,14 @@ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16: ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %bc = bitcast <16 x i16> %vec to <8 x i32> @@ -243,6 +259,7 @@ ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512F-NEXT: vmovaps %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_to_v4i32: @@ -251,6 +268,7 @@ ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512VL-NEXT: vmovaps %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v8i32_to_v4i32: @@ -259,6 +277,7 @@ ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32: @@ -267,6 +286,7 @@ ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %L %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -288,12 +308,14 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v4i64_to_v4i32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v4i64_to_v4i32: @@ -301,12 +323,14 @@ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32: ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %L %bc = bitcast <8 x i32> %vec to <4 x i64> @@ -337,6 +361,7 @@ ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpckldq 
{{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: @@ -348,6 +373,7 @@ ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: @@ -359,6 +385,7 @@ ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: @@ -373,6 +400,7 @@ ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> @@ -397,12 +425,14 @@ ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v8i32_to_v8i8: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v8i32_to_v8i8: @@ -411,12 +441,14 @@ ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8: ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %bc = bitcast <32 x i8> %vec to <8 x i32> @@ -449,6 +481,7 @@ ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: @@ -457,6 +490,7 @@ ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: @@ -469,6 +503,7 @@ ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: @@ -477,6 +512,7 @@ ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -500,12 +536,14 @@ ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v4i64_to_v4i16: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; 
AVX512VL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v4i64_to_v4i16: @@ -514,12 +552,14 @@ ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16: ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %bc = bitcast <16 x i16> %vec to <4 x i64> @@ -550,6 +590,7 @@ ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: @@ -558,6 +599,7 @@ ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: @@ -569,6 +611,7 @@ ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: @@ -577,6 +620,7 @@ ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> @@ -600,12 +644,14 @@ ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v4i64_to_v4i8: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v4i64_to_v4i8: @@ -614,12 +660,14 @@ ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8: ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %bc = bitcast <32 x i8> %vec to <4 x i64> Index: test/CodeGen/X86/shuffle-vs-trunc-512.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -18,6 +18,7 @@ ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v32i8: @@ -29,6 +30,7 @@ ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; 
AVX512BW-LABEL: shuffle_v64i8_to_v32i8: @@ -40,6 +42,7 @@ ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8: @@ -51,6 +54,7 @@ ; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> @@ -67,6 +71,7 @@ ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_v32i16_to_v32i8: @@ -77,18 +82,21 @@ ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_v32i16_to_v32i8: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8: ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %bc = bitcast <64 x i8> %vec to <32 x i16> @@ -107,6 +115,7 @@ ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i16_to_v16i16: @@ -120,6 +129,7 @@ ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16: @@ -131,6 +141,7 @@ ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16: @@ -141,6 +152,7 @@ ; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> @@ -153,6 +165,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L %bc = bitcast <32 x i16> %vec to <16 x i32> @@ -169,6 +182,7 @@ ; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %L %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -181,6 +195,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vpmovqd %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %L %bc = bitcast <16 x i32> %vec to <8 x i64> @@ -206,6 +221,7 @@ ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v16i8: @@ -224,6 +240,7 @@ ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v16i8: @@ -265,6 +282,7 @@ ; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8: @@ -306,6 +324,7 @@ ; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax ; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 ; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> @@ -318,6 +337,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %bc = bitcast <64 x i8> %vec to <16 x i32> @@ -345,6 +365,7 @@ ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16: @@ -365,6 +386,7 @@ ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16: @@ -388,6 +410,7 @@ ; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax ; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: @@ -411,6 +434,7 @@ ; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax ; AVX512BWVL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 ; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> @@ -423,6 +447,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L %bc = bitcast <32 x i16> %vec to <8 x i64> @@ -448,6 +473,7 @@ ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: @@ -466,6 +492,7 @@ ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: @@ -491,6 +518,7 @@ ; 
AVX512BW-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0 ; AVX512BW-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: @@ -516,6 +544,7 @@ ; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax ; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0 ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> @@ -528,6 +557,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %bc = bitcast <64 x i8> %vec to <8 x i64> Index: test/CodeGen/X86/sse-fsignum.ll =================================================================== --- test/CodeGen/X86/sse-fsignum.ll +++ test/CodeGen/X86/sse-fsignum.ll @@ -102,6 +102,7 @@ ; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps %ymm0, (%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq entry: %1 = load <8 x float>, <8 x float>* %0 @@ -161,6 +162,7 @@ ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512F-NEXT: vsubpd %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vmovapd %ymm0, (%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq entry: %1 = load <4 x double>, <4 x double>* %0 @@ -178,43 +180,18 @@ ; define void @signum32c(<8 x float>*) { -; AVX1-LABEL: signum32c: -; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: signum32c: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: signum32c: -; AVX512F: # BB#0: # %entry -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; AVX512F-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX512F-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vmovaps %ymm0, (%rdi) -; AVX512F-NEXT: retq +; AVX-LABEL: signum32c: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovaps (%rdi), %ymm0 +; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 +; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq entry: %1 = load <8 x float>, <8 x float>* %0 %2 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %1, <8 x float> zeroinitializer, i8 1) @@ -270,6 +247,7 @@ ; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512F-NEXT: vmovaps %ymm0, (%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq entry: %x = load <4 x double>, <4 x double>* %0 Index: 
test/CodeGen/X86/subvector-broadcast.ll =================================================================== --- test/CodeGen/X86/subvector-broadcast.ll +++ test/CodeGen/X86/subvector-broadcast.ll @@ -1341,6 +1341,7 @@ ; X32-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; X32-AVX512-NEXT: vmovdqu %ymm0, _ga4 ; X32-AVX512-NEXT: vmovdqu64 %zmm1, _gb4 +; X32-AVX512-NEXT: vzeroupper ; X32-AVX512-NEXT: retl ; ; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: @@ -1391,6 +1392,7 @@ ; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; X64-AVX512-NEXT: vmovdqu %ymm0, {{.*}}(%rip) ; X64-AVX512-NEXT: vmovdqu64 %zmm1, {{.*}}(%rip) +; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq entry: %0 = add <4 x i64> %a, @@ -1429,6 +1431,7 @@ ; X32-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1 ; X32-AVX512-NEXT: vmovupd %ymm0, _ga2 ; X32-AVX512-NEXT: vmovupd %zmm1, _gb2 +; X32-AVX512-NEXT: vzeroupper ; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64: @@ -1454,6 +1457,7 @@ ; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1 ; X64-AVX512-NEXT: vmovupd %ymm0, {{.*}}(%rip) ; X64-AVX512-NEXT: vmovupd %zmm1, {{.*}}(%rip) +; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq entry: %0 = fadd <4 x double> %a, Index: test/CodeGen/X86/vec_fp_to_int.ll =================================================================== --- test/CodeGen/X86/vec_fp_to_int.ll +++ test/CodeGen/X86/vec_fp_to_int.ll @@ -63,6 +63,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64: @@ -112,18 +113,12 @@ ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_4f64_to_2i32: -; VEX: # BB#0: -; VEX-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; VEX-NEXT: vzeroupper -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_4f64_to_2i32: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_4f64_to_2i32: +; AVX: # BB#0: +; AVX-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> %cvt = fptosi <4 x double> %ext to <4 x i32> ret <4 x i32> %cvt @@ -243,16 +238,11 @@ ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_4f64_to_4i32: -; VEX: # BB#0: -; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; VEX-NEXT: vzeroupper -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_4f64_to_4i32: -; AVX512: # BB#0: -; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_4f64_to_4i32: +; AVX: # BB#0: +; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %cvt = fptosi <4 x double> %a to <4 x i32> ret <4 x i32> %cvt } @@ -334,6 +324,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64: @@ -400,6 +391,7 @@ ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f64_to_4i32: @@ -412,6 +404,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; 
AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32: @@ -477,6 +470,7 @@ ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f64_to_2i32: @@ -489,6 +483,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32: @@ -550,12 +545,14 @@ ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_4f64_to_2i32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_4f64_to_2i32: @@ -563,12 +560,14 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32: ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> %cvt = fptoui <4 x double> %ext to <4 x i32> @@ -816,11 +815,13 @@ ; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_4f64_to_4i32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: fptoui_4f64_to_4i32: @@ -828,11 +829,13 @@ ; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32: ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq %cvt = fptoui <4 x double> %a to <4 x i32> ret <4 x i32> %cvt @@ -980,12 +983,14 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64: ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 ; AVX512VLDQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq %cvt = fptosi <4 x float> %a to <4 x i64> %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> @@ -1281,6 +1286,7 @@ ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f32_to_2i32: @@ -1294,6 +1300,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32: @@ -1347,6 +1354,7 @@ ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: 
vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_4f32_to_4i32: @@ -1359,6 +1367,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32: @@ -1529,12 +1538,14 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64: ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0 ; AVX512VLDQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq %cvt = fptoui <4 x float> %a to <4 x i64> %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> @@ -2291,6 +2302,7 @@ ; AVX512F-NEXT: vmovq %rax, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptosi_2f16_to_4i32: @@ -2321,6 +2333,7 @@ ; AVX512DQ-NEXT: vmovq %rax, %xmm0 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32: Index: test/CodeGen/X86/vec_fpext.ll =================================================================== --- test/CodeGen/X86/vec_fpext.ll +++ test/CodeGen/X86/vec_fpext.ll @@ -82,6 +82,7 @@ ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x01] ; X32-AVX512VL-NEXT: vmovups %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x00] +; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: fpext_frommem4: @@ -103,6 +104,7 @@ ; X64-AVX512VL: # BB#0: # %entry ; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x07] ; X64-AVX512VL-NEXT: vmovups %ymm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x06] +; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: %0 = load <4 x float>, <4 x float>* %in @@ -143,6 +145,7 @@ ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x01] ; X32-AVX512VL-NEXT: vmovups %zmm0, (%eax) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x00] +; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X32-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: fpext_frommem8: @@ -170,6 +173,7 @@ ; X64-AVX512VL: # BB#0: # %entry ; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x07] ; X64-AVX512VL-NEXT: vmovups %zmm0, (%rsi) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x06] +; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] entry: %0 = load <8 x float>, <8 x float>* %in Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 
| FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX2 @@ -61,6 +60,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64: @@ -92,18 +92,12 @@ ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; VEX-LABEL: sitofp_4i32_to_2f64: -; VEX: # BB#0: -; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VEX-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; VEX-NEXT: vzeroupper -; VEX-NEXT: retq -; -; AVX512-LABEL: sitofp_4i32_to_2f64: -; AVX512: # BB#0: -; AVX512-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512-NEXT: retq +; AVX-LABEL: sitofp_4i32_to_2f64: +; AVX: # BB#0: +; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %cvt = sitofp <4 x i32> %a to <4 x double> %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> ret <2 x double> %shuf @@ -156,6 +150,7 @@ ; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cvt = sitofp <8 x i16> %a to <8 x double> %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> @@ -211,6 +206,7 @@ ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cvt = sitofp <16 x i8> %a to <16 x double> %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> @@ -498,6 +494,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64: @@ -536,6 +533,7 @@ ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_2i32_to_2f64: @@ -548,6 +546,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64: @@ -603,12 +602,14 @@ ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_4i32_to_2f64: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0 ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_4i32_to_2f64: @@ -616,12 +617,14 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64: ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0 ; AVX512VLDQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq %cvt = uitofp <4 x i32> %a to <4 x double> %shuf = shufflevector <4 x 
double> %cvt, <4 x double> undef, <2 x i32> @@ -675,6 +678,7 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cvt = uitofp <8 x i16> %a to <8 x double> %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> @@ -730,6 +734,7 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cvt = uitofp <16 x i8> %a to <16 x double> %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> @@ -1089,6 +1094,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32: @@ -1147,6 +1153,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero: @@ -1212,12 +1219,14 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef: ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> %cvt = sitofp <4 x i64> %ext to <4 x float> @@ -1288,6 +1297,7 @@ ; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cvt = sitofp <8 x i16> %a to <8 x float> %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> @@ -1346,6 +1356,7 @@ ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cvt = sitofp <16 x i8> %a to <16 x float> %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> @@ -1421,6 +1432,7 @@ ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_4i64_to_4f32: @@ -1437,6 +1449,7 @@ ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_4i64_to_4f32: @@ -1444,11 +1457,13 @@ ; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32: ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: 
vcvtqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq %cvt = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %cvt @@ -1697,6 +1712,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32: @@ -1805,6 +1821,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32: @@ -1932,12 +1949,14 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef: ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> %cvt = uitofp <4 x i64> %ext to <4 x float> @@ -1982,6 +2001,7 @@ ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_4i32_to_4f32: @@ -1994,6 +2014,7 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32: @@ -2054,6 +2075,7 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cvt = uitofp <8 x i16> %a to <8 x float> %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> @@ -2112,6 +2134,7 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cvt = uitofp <16 x i8> %a to <16 x float> %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> @@ -2335,6 +2358,7 @@ ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_4i64_to_4f32: @@ -2351,6 +2375,7 @@ ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_4i64_to_4f32: @@ -2358,11 +2383,13 @@ ; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32: ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 
+; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq %cvt = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %cvt @@ -2607,6 +2634,7 @@ ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64: @@ -2926,6 +2954,7 @@ ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64: @@ -2967,6 +2996,7 @@ ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_2i32_to_2f64: @@ -2981,6 +3011,7 @@ ; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64: @@ -3416,6 +3447,7 @@ ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_4i64_to_4f32: @@ -3433,6 +3465,7 @@ ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32: @@ -3440,6 +3473,7 @@ ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32: @@ -4003,6 +4037,7 @@ ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_4i64_to_4f32: @@ -4020,6 +4055,7 @@ ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32: @@ -4027,6 +4063,7 @@ ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 ; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32: @@ -4079,6 +4116,7 @@ ; AVX512F-NEXT: vmovaps (%rdi), %xmm0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_4i32_to_4f32: @@ -4091,6 +4129,7 @@ ; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32: @@ -4810,6 +4849,7 @@ ; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0 ; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX512-NEXT: vmovaps %ymm0, (%rax) +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = load %Arguments, %Arguments* %a0, align 1 %2 = extractvalue %Arguments %1, 1 Index: test/CodeGen/X86/vec_uint_to_fp-fastmath.ll =================================================================== --- 
test/CodeGen/X86/vec_uint_to_fp-fastmath.ll +++ test/CodeGen/X86/vec_uint_to_fp-fastmath.ll @@ -77,6 +77,7 @@ ; AVX512F-NEXT: # kill ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32: Index: test/CodeGen/X86/vector-compare-all_of.ll =================================================================== --- test/CodeGen/X86/vector-compare-all_of.ll +++ test/CodeGen/X86/vector-compare-all_of.ll @@ -70,6 +70,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 %s = sext <4 x i1> %c to <4 x i64> @@ -120,6 +121,7 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: cltq +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -210,6 +212,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 %s = sext <8 x i1> %c to <8 x i32> @@ -261,6 +264,7 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: cwtl +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 %s = sext <8 x i1> %c to <8 x i16> @@ -356,6 +360,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 %s = sext <4 x i1> %c to <4 x i64> @@ -422,6 +427,7 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: cltq +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -527,6 +533,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 %s = sext <8 x i1> %c to <8 x i32> @@ -593,6 +600,7 @@ ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: cwtl +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 %s = sext <8 x i1> %c to <8 x i16> @@ -713,6 +721,7 @@ ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: %AX %AX %EAX +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <16 x i16> %a0, %a1 %s = sext <16 x i1> %c to <16 x i16> @@ -787,6 +796,7 @@ ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: movsbl %al, %eax ; AVX512-NEXT: # kill: %AX %AX %EAX +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <16 x i16> %a0, %a1 %s = sext <16 x i1> %c to <16 x i8> @@ -917,6 +927,7 @@ ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: %AL %AL %EAX +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <32 x i8> %a0, %a1 %s = sext <32 x i1> %c to <32 x i8> Index: test/CodeGen/X86/vector-compare-any_of.ll =================================================================== --- test/CodeGen/X86/vector-compare-any_of.ll +++ test/CodeGen/X86/vector-compare-any_of.ll @@ -68,6 +68,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: 
vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 %s = sext <4 x i1> %c to <4 x i64> @@ -114,6 +115,7 @@ ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: cltq +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -196,6 +198,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 %s = sext <8 x i1> %c to <8 x i32> @@ -243,6 +246,7 @@ ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: cwtl +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 %s = sext <8 x i1> %c to <8 x i16> @@ -334,6 +338,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 %s = sext <4 x i1> %c to <4 x i64> @@ -394,6 +399,7 @@ ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: cltq +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 %s = sext <4 x i1> %c to <4 x i32> @@ -489,6 +495,7 @@ ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 %s = sext <8 x i1> %c to <8 x i32> @@ -549,6 +556,7 @@ ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: cwtl +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 %s = sext <8 x i1> %c to <8 x i16> @@ -662,6 +670,7 @@ ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: %AX %AX %EAX +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <16 x i16> %a0, %a1 %s = sext <16 x i1> %c to <16 x i16> @@ -730,6 +739,7 @@ ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: movsbl %al, %eax ; AVX512-NEXT: # kill: %AX %AX %EAX +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <16 x i16> %a0, %a1 %s = sext <16 x i1> %c to <16 x i8> @@ -853,6 +863,7 @@ ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax ; AVX512-NEXT: # kill: %AL %AL %EAX +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <32 x i8> %a0, %a1 %s = sext <32 x i1> %c to <32 x i8> Index: test/CodeGen/X86/vector-compare-results.ll =================================================================== --- test/CodeGen/X86/vector-compare-results.ll +++ test/CodeGen/X86/vector-compare-results.ll @@ -146,6 +146,7 @@ ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = fcmp ogt <4 x double> %a0, %a1 ret <4 x i1> %1 @@ -181,6 +182,7 @@ ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = fcmp ogt <8 x float> %a0, %a1 ret <8 x i1> %1 @@ -243,6 +245,7 @@ ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp sgt <4 x i64> %a0, %a1 ret <4 x i1> %1 @@ -279,6 +282,7 @@ ; AVX512-NEXT: vpcmpgtd 
%ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp sgt <8 x i32> %a0, %a1 ret <8 x i1> %1 @@ -315,6 +319,7 @@ ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v16i16: @@ -322,6 +327,7 @@ ; AVX512DQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v16i16: @@ -329,6 +335,7 @@ ; AVX512BW-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i16> %a0, %a1 ret <16 x i1> %1 @@ -610,6 +617,7 @@ ; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v8f64: @@ -617,6 +625,7 @@ ; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v8f64: @@ -624,6 +633,7 @@ ; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = fcmp ogt <8 x double> %a0, %a1 ret <8 x i1> %1 @@ -670,6 +680,7 @@ ; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v16f32: @@ -677,6 +688,7 @@ ; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v16f32: @@ -684,6 +696,7 @@ ; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = fcmp ogt <16 x float> %a0, %a1 ret <16 x i1> %1 @@ -781,6 +794,7 @@ ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v8i64: @@ -788,6 +802,7 @@ ; AVX512DQ-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v8i64: @@ -795,6 +810,7 @@ ; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = icmp sgt <8 x i64> %a0, %a1 ret <8 x i1> %1 @@ -844,6 +860,7 @@ ; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v16i32: @@ -851,6 +868,7 @@ ; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; 
AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v16i32: @@ -858,6 +876,7 @@ ; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i32> %a0, %a1 ret <16 x i1> %1 @@ -1969,6 +1988,7 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm3 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512F-NEXT: vmovdqa %xmm4, %xmm2 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v64i8: @@ -1979,6 +1999,7 @@ ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm3 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm2 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v64i8: @@ -2173,6 +2194,7 @@ ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v16f64: @@ -2288,6 +2310,7 @@ ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v16f64: @@ -2403,6 +2426,7 @@ ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = fcmp ogt <16 x double> %a0, %a1 ret <16 x i1> %1 @@ -3652,6 +3676,7 @@ ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v16i64: @@ -3783,6 +3808,7 @@ ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v16i64: @@ -3914,6 +3940,7 @@ ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i64> %a0, %a1 ret <16 x i1> %1 @@ -6063,6 +6090,7 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512F-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v64i16: @@ -6349,6 +6377,7 @@ ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v64i16: @@ -8416,6 +8445,7 @@ ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, (%rdi) ; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v128i8: @@ -8461,6 +8491,7 @@ ; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: kmovw %k0, (%rdi) ; AVX512DQ-NEXT: movq %rdi, %rax +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_cmp_v128i8: Index: test/CodeGen/X86/vector-half-conversions.ll =================================================================== --- test/CodeGen/X86/vector-half-conversions.ll +++ test/CodeGen/X86/vector-half-conversions.ll @@ -29,6 +29,7 @@ ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: vzeroupper ; 
AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_i16_to_f32: @@ -122,6 +123,7 @@ ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_4i16_to_4f32: @@ -232,6 +234,7 @@ ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_8i16_to_4f32: @@ -880,6 +883,7 @@ ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: load_cvt_i16_to_f32: @@ -950,6 +954,7 @@ ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: load_cvt_4i16_to_4f32: @@ -1053,6 +1058,7 @@ ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: load_cvt_8i16_to_4f32: @@ -1534,6 +1540,7 @@ ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_i16_to_f64: @@ -1598,6 +1605,7 @@ ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_2i16_to_2f64: @@ -1789,6 +1797,7 @@ ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_8i16_to_2f64: @@ -2187,6 +2196,7 @@ ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: load_cvt_i16_to_f64: @@ -2240,6 +2250,7 @@ ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: load_cvt_2i16_to_2f64: @@ -2684,6 +2695,7 @@ ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: # kill: %AX %AX %EAX +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_f32_to_i16: @@ -2769,6 +2781,7 @@ ; AVX512F-NEXT: shlq $32, %rdx ; AVX512F-NEXT: orq %rcx, %rdx ; AVX512F-NEXT: vmovq %rdx, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_4f32_to_4i16: @@ -2874,6 +2887,7 @@ ; AVX512F-NEXT: orq %rcx, %rdx ; AVX512F-NEXT: vmovq %rdx, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_4f32_to_8i16_undef: @@ -2983,6 +2997,7 @@ ; AVX512F-NEXT: orq %rcx, %rdx ; AVX512F-NEXT: vmovq %rdx, %xmm0 ; AVX512F-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_4f32_to_8i16_zero: @@ -3159,6 +3174,7 @@ ; AVX512F-NEXT: vmovq %rsi, %xmm0 ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: cvt_8f32_to_8i16: @@ -3205,6 +3221,7 @@ ; AVX512VL-NEXT: vmovq %rsi, %xmm0 ; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = fptrunc <8 x float> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> @@ -3511,6 +3528,7 @@ ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0 ; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: movw %ax, (%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: store_cvt_f32_to_i16: @@ -3582,6 +3600,7 @@ ; AVX512F-NEXT: movw %dx, 6(%rdi) ; AVX512F-NEXT: movw %cx, 4(%rdi) ; AVX512F-NEXT: movw %ax, 2(%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: store_cvt_4f32_to_4i16: @@ -3686,6 +3705,7 @@ ; AVX512F-NEXT: vmovq %rdx, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef: @@ -3800,6 +3820,7 @@ ; AVX512F-NEXT: vmovq %rdx, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero: @@ -3945,6 +3966,7 @@ ; AVX512F-NEXT: movw %r10w, 6(%rdi) ; AVX512F-NEXT: movw %r9w, 4(%rdi) ; AVX512F-NEXT: movw %r8w, 2(%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: store_cvt_8f32_to_8i16: @@ -3980,6 +4002,7 @@ ; AVX512VL-NEXT: movw %r10w, 6(%rdi) ; AVX512VL-NEXT: movw %r9w, 4(%rdi) ; AVX512VL-NEXT: movw %r8w, 2(%rdi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = fptrunc <8 x float> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> @@ -4187,6 +4210,7 @@ ; AVX512F-NEXT: movw %ax, 4(%rdi) ; AVX512F-NEXT: vmovd %xmm4, %eax ; AVX512F-NEXT: movw %ax, 2(%rdi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: store_cvt_16f32_to_16i16: @@ -4254,6 +4278,7 @@ ; AVX512VL-NEXT: movw %ax, 4(%rdi) ; AVX512VL-NEXT: vmovd %xmm4, %eax ; AVX512VL-NEXT: movw %ax, 2(%rdi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = fptrunc <16 x float> %a0 to <16 x half> %2 = bitcast <16 x half> %1 to <16 x i16> @@ -4379,11 +4404,13 @@ ; AVX512F-NEXT: subq $40, %rsp ; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bx ; AVX512F-NEXT: shll $16, %ebx ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movzwl %ax, %r14d ; AVX512F-NEXT: orl %ebx, %r14d @@ -4391,6 +4418,7 @@ ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bx ; AVX512F-NEXT: shll $16, %ebx @@ -4413,11 +4441,13 @@ ; AVX512VL-NEXT: subq $40, %rsp ; 
AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bx ; AVX512VL-NEXT: shll $16, %ebx ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movzwl %ax, %r14d ; AVX512VL-NEXT: orl %ebx, %r14d @@ -4425,6 +4455,7 @@ ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bx ; AVX512VL-NEXT: shll $16, %ebx @@ -4528,11 +4559,13 @@ ; AVX512F-NEXT: subq $40, %rsp ; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bx ; AVX512F-NEXT: shll $16, %ebx ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movzwl %ax, %r14d ; AVX512F-NEXT: orl %ebx, %r14d @@ -4540,6 +4573,7 @@ ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bx ; AVX512F-NEXT: shll $16, %ebx @@ -4563,11 +4597,13 @@ ; AVX512VL-NEXT: subq $40, %rsp ; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bx ; AVX512VL-NEXT: shll $16, %ebx ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movzwl %ax, %r14d ; AVX512VL-NEXT: orl %ebx, %r14d @@ -4575,6 +4611,7 @@ ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bx ; AVX512VL-NEXT: shll $16, %ebx @@ -4682,11 +4719,13 @@ ; AVX512F-NEXT: subq $40, %rsp ; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bx ; AVX512F-NEXT: shll $16, %ebx ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movzwl %ax, %r14d ; AVX512F-NEXT: orl %ebx, %r14d @@ -4694,6 +4733,7 @@ ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bx ; AVX512F-NEXT: shll $16, %ebx @@ -4717,11 +4757,13 @@ ; AVX512VL-NEXT: subq $40, %rsp ; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bx ; AVX512VL-NEXT: shll $16, %ebx ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 
32-byte Reload ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movzwl %ax, %r14d ; AVX512VL-NEXT: orl %ebx, %r14d @@ -4729,6 +4771,7 @@ ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bx ; AVX512VL-NEXT: shll $16, %ebx @@ -4897,11 +4940,13 @@ ; AVX512F-NEXT: subq $96, %rsp ; AVX512F-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bx ; AVX512F-NEXT: shll $16, %ebx ; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movzwl %ax, %r15d ; AVX512F-NEXT: orl %ebx, %r15d @@ -4909,6 +4954,7 @@ ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bx ; AVX512F-NEXT: shll $16, %ebx @@ -4922,11 +4968,13 @@ ; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bx ; AVX512F-NEXT: shll $16, %ebx ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movzwl %ax, %r15d ; AVX512F-NEXT: orl %ebx, %r15d @@ -4934,6 +4982,7 @@ ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bx ; AVX512F-NEXT: shll $16, %ebx @@ -4960,11 +5009,13 @@ ; AVX512VL-NEXT: subq $96, %rsp ; AVX512VL-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bx ; AVX512VL-NEXT: shll $16, %ebx ; AVX512VL-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movzwl %ax, %r15d ; AVX512VL-NEXT: orl %ebx, %r15d @@ -4972,6 +5023,7 @@ ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bx ; AVX512VL-NEXT: shll $16, %ebx @@ -4985,11 +5037,13 @@ ; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bx ; AVX512VL-NEXT: shll $16, %ebx ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movzwl %ax, %r15d ; AVX512VL-NEXT: orl %ebx, %r15d @@ -4997,6 +5051,7 @@ ; 
AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bx ; AVX512VL-NEXT: shll $16, %ebx @@ -5150,16 +5205,19 @@ ; AVX512F-NEXT: movq %rdi, %rbx ; AVX512F-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movl %eax, %r14d ; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movl %eax, %r15d ; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movl %eax, %ebp ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload @@ -5185,16 +5243,19 @@ ; AVX512VL-NEXT: movq %rdi, %rbx ; AVX512VL-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %r14d ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %r15d ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %ebp ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload @@ -5309,11 +5370,13 @@ ; AVX512F-NEXT: movq %rdi, %r14 ; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bp ; AVX512F-NEXT: shll $16, %ebp ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movzwl %ax, %ebx ; AVX512F-NEXT: orl %ebp, %ebx @@ -5321,6 +5384,7 @@ ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bp ; AVX512F-NEXT: shll $16, %ebp @@ -5348,11 +5412,13 @@ ; AVX512VL-NEXT: movq %rdi, %r14 ; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bp ; AVX512VL-NEXT: shll $16, %ebp ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movzwl %ax, %ebx ; AVX512VL-NEXT: orl %ebp, %ebx @@ -5360,6 +5426,7 @@ ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; 
AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bp ; AVX512VL-NEXT: shll $16, %ebp @@ -5480,11 +5547,13 @@ ; AVX512F-NEXT: movq %rdi, %r14 ; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bp ; AVX512F-NEXT: shll $16, %ebp ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movzwl %ax, %ebx ; AVX512F-NEXT: orl %ebp, %ebx @@ -5492,6 +5561,7 @@ ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, %bp ; AVX512F-NEXT: shll $16, %ebp @@ -5519,11 +5589,13 @@ ; AVX512VL-NEXT: movq %rdi, %r14 ; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bp ; AVX512VL-NEXT: shll $16, %ebp ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movzwl %ax, %ebx ; AVX512VL-NEXT: orl %ebp, %ebx @@ -5531,6 +5603,7 @@ ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, %bp ; AVX512VL-NEXT: shll $16, %ebp @@ -5708,28 +5781,33 @@ ; AVX512F-NEXT: movq %rdi, %rbx ; AVX512F-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill ; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill ; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movl %eax, %r12d ; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movl %eax, %r13d ; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movl %eax, %ebp ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload @@ -5737,6 +5815,7 @@ ; AVX512F-NEXT: movl %eax, %r14d ; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: callq __truncdfhf2 ; AVX512F-NEXT: movl %eax, %r15d ; 
AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload @@ -5772,28 +5851,33 @@ ; AVX512VL-NEXT: movq %rdi, %rbx ; AVX512VL-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill ; AVX512VL-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload ; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512VL-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %r12d ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %r13d ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %ebp ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload @@ -5801,6 +5885,7 @@ ; AVX512VL-NEXT: movl %eax, %r14d ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: callq __truncdfhf2 ; AVX512VL-NEXT: movl %eax, %r15d ; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload Index: test/CodeGen/X86/vector-rotate-128.ll =================================================================== --- test/CodeGen/X86/vector-rotate-128.ll +++ test/CodeGen/X86/vector-rotate-128.ll @@ -461,6 +461,7 @@ ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: var_rotate_v8i16: @@ -702,6 +703,7 @@ ; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: var_rotate_v16i8: @@ -716,6 +718,7 @@ ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; XOP-LABEL: var_rotate_v16i8: @@ -1085,6 +1088,7 @@ ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: constant_rotate_v8i16: @@ -1272,6 +1276,7 @@ ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; XOP-LABEL: constant_rotate_v16i8: Index: test/CodeGen/X86/vector-sext.ll =================================================================== --- test/CodeGen/X86/vector-sext.ll +++ 
test/CodeGen/X86/vector-sext.ll @@ -1246,6 +1246,7 @@ ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_sext_2i1_to_2i64: @@ -1254,6 +1255,7 @@ ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_2i1_to_2i64: @@ -1436,6 +1438,7 @@ ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_sext_4i1_to_4i32: @@ -1445,6 +1448,7 @@ ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_4i1_to_4i32: @@ -1941,6 +1945,7 @@ ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_sext_8i1_to_8i16: @@ -1949,6 +1954,7 @@ ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_8i1_to_8i16: @@ -2852,6 +2858,7 @@ ; AVX512-NEXT: kmovw (%rdi), %k1 ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_16i1_to_16i8: Index: test/CodeGen/X86/vector-shuffle-masked.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-masked.ll +++ test/CodeGen/X86/vector-shuffle-masked.ll @@ -243,6 +243,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextracti32x4 $0, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -257,6 +258,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -271,6 +273,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -285,6 +288,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -299,6 +303,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextractf32x4 $0, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -313,6 +318,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextractf32x4 $1, 
%zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -327,6 +333,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -341,6 +348,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -407,6 +415,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextracti64x2 $0, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -421,6 +430,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -435,6 +445,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextracti64x2 $2, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -449,6 +460,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextracti64x2 $3, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -463,6 +475,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextractf64x2 $0, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -477,6 +490,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -491,6 +505,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextractf64x2 $2, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -505,6 +520,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextractf64x2 $3, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> %mask.cast = bitcast i8 %mask to <8 x i1> @@ -603,6 +619,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> %shuffle.cast = bitcast <2 x i64> %shuffle to <4 x i32> @@ -618,6 +635,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1} ; 
CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> %shuffle.cast = bitcast <2 x double> %shuffle to <4 x float> @@ -663,6 +681,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> %shuffle.cast = bitcast <4 x i32> %shuffle to <2 x i64> @@ -678,6 +697,7 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> %shuffle.cast = bitcast <4 x float> %shuffle to <2 x double> Index: test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v1.ll +++ test/CodeGen/X86/vector-shuffle-v1.ll @@ -78,6 +78,7 @@ ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0: @@ -88,6 +89,7 @@ ; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %a2 = icmp eq <8 x i64> %a, %a1 %b2 = icmp eq <8 x i64> %b, %b1 @@ -108,6 +110,7 @@ ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: @@ -120,6 +123,7 @@ ; VL_BW_DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k0 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0 +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %a2 = icmp eq <16 x i32> %a, %a1 %b2 = icmp eq <16 x i32> %b, %b1 @@ -162,6 +166,7 @@ ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u: @@ -172,6 +177,7 @@ ; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> @@ -189,6 +195,7 @@ ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u: @@ -200,6 +207,7 @@ ; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> @@ -216,6 +224,7 @@ ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u: @@ -225,6 +234,7 @@ ; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1] ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: 
retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> @@ -243,6 +253,7 @@ ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0: @@ -254,6 +265,7 @@ ; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> @@ -272,6 +284,7 @@ ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0: @@ -283,6 +296,7 @@ ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> @@ -303,6 +317,7 @@ ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1: @@ -316,6 +331,7 @@ ; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i8 %a to <8 x i1> %c = shufflevector <8 x i1> , <8 x i1> %b, <8 x i32> @@ -336,6 +352,7 @@ ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: @@ -348,6 +365,7 @@ ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 ; VL_BW_DQ-NEXT: kmovb %k0, %eax +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %c = shufflevector <8 x i1> , <8 x i1> %a, <8 x i32> %c1 = bitcast <8 x i1>%c to i8 @@ -364,6 +382,7 @@ ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: @@ -373,6 +392,7 @@ ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovw %k0, %eax +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i16 %a to <16 x i1> %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer @@ -413,6 +433,7 @@ ; AVX512F-NEXT: orq %rcx, %rax ; AVX512F-NEXT: movq %rbp, %rsp ; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf64i1_zero: @@ -422,6 +443,7 @@ ; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0 ; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0 ; VL_BW_DQ-NEXT: kmovq %k0, %rax +; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq %b = bitcast i64 %a to <64 x i1> %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer Index: test/CodeGen/X86/vector-trunc-math.ll =================================================================== --- test/CodeGen/X86/vector-trunc-math.ll +++ test/CodeGen/X86/vector-trunc-math.ll @@ -42,6 +42,7 @@ ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq 
%1 = add <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -109,6 +110,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -154,6 +156,7 @@ ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -262,6 +265,7 @@ ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_v16i64_v16i8: @@ -272,6 +276,7 @@ ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_v16i64_v16i8: @@ -282,6 +287,7 @@ ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = add <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -346,6 +352,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -392,6 +399,7 @@ ; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_v16i16_v16i8: @@ -399,6 +407,7 @@ ; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_v16i16_v16i8: @@ -406,6 +415,7 @@ ; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = add <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -452,6 +462,7 @@ ; AVX512-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> %2 = sext <8 x i8> %1 to <8 x i32> @@ -492,6 +503,7 @@ ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -549,6 +561,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <8 x i64> %a0, %2 = trunc <8 x i64> %1 to <8 x i16> @@ -590,6 +603,7 @@ ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -676,6 +690,7 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8: @@ -685,6 +700,7 
@@ ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8: @@ -694,6 +710,7 @@ ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = add <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> @@ -749,6 +766,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = add <16 x i32> %a0, %2 = trunc <16 x i32> %1 to <16 x i8> @@ -792,6 +810,7 @@ ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8: @@ -799,6 +818,7 @@ ; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8: @@ -806,6 +826,7 @@ ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = add <16 x i16> %a0, %2 = trunc <16 x i16> %1 to <16 x i8> @@ -848,6 +869,7 @@ ; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -915,6 +937,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -960,6 +983,7 @@ ; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -1068,6 +1092,7 @@ ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_sub_v16i64_v16i8: @@ -1078,6 +1103,7 @@ ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_sub_v16i64_v16i8: @@ -1088,6 +1114,7 @@ ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = sub <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -1152,6 +1179,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -1198,6 +1226,7 @@ ; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_sub_v16i16_v16i8: @@ -1205,6 +1234,7 @@ ; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; 
AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8: @@ -1212,6 +1242,7 @@ ; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = sub <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -1259,6 +1290,7 @@ ; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -1330,6 +1362,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i64> %a0, %2 = trunc <8 x i64> %1 to <8 x i16> @@ -1374,6 +1407,7 @@ ; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -1484,6 +1518,7 @@ ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_sub_const_v16i64_v16i8: @@ -1494,6 +1529,7 @@ ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_sub_const_v16i64_v16i8: @@ -1504,6 +1540,7 @@ ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = sub <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> @@ -1566,6 +1603,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = sub <16 x i32> %a0, %2 = trunc <16 x i32> %1 to <16 x i8> @@ -1611,6 +1649,7 @@ ; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: @@ -1618,6 +1657,7 @@ ; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: @@ -1625,6 +1665,7 @@ ; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = sub <16 x i16> %a0, %2 = trunc <16 x i16> %1 to <16 x i8> @@ -1688,6 +1729,7 @@ ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: @@ -1697,6 +1739,7 @@ ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: @@ -1706,6 +1749,7 @@ ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512DQ-NEXT: # kill: %XMM0 
%XMM0 %YMM0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -1792,6 +1836,7 @@ ; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: @@ -1799,12 +1844,14 @@ ; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: ; AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -1862,6 +1909,7 @@ ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -2116,6 +2164,7 @@ ; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: @@ -2128,6 +2177,7 @@ ; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: @@ -2138,6 +2188,7 @@ ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -2226,6 +2277,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -2272,6 +2324,7 @@ ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: @@ -2279,6 +2332,7 @@ ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: @@ -2286,6 +2340,7 @@ ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -2332,6 +2387,7 @@ ; AVX512-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> %2 = zext <8 x i8> %1 to <8 x i32> @@ -2387,6 +2443,7 @@ ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -2444,6 +2501,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; 
AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <8 x i64> %a0, %2 = trunc <8 x i64> %1 to <8 x i16> @@ -2485,6 +2543,7 @@ ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -2681,6 +2740,7 @@ ; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: @@ -2691,6 +2751,7 @@ ; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: @@ -2701,6 +2762,7 @@ ; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> @@ -2791,6 +2853,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = mul <16 x i32> %a0, %2 = trunc <16 x i32> %1 to <16 x i8> @@ -2836,6 +2899,7 @@ ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: @@ -2843,6 +2907,7 @@ ; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: @@ -2850,6 +2915,7 @@ ; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = mul <16 x i16> %a0, %2 = trunc <16 x i16> %1 to <16 x i8> @@ -2890,6 +2956,7 @@ ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -2953,6 +3020,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -2996,6 +3064,7 @@ ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -3096,6 +3165,7 @@ ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_v16i64_v16i8: @@ -3106,6 +3176,7 @@ ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_v16i64_v16i8: @@ -3116,6 +3187,7 @@ ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, 
%zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = and <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -3176,6 +3248,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -3220,6 +3293,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_v16i16_v16i8: @@ -3227,6 +3301,7 @@ ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8: @@ -3234,6 +3309,7 @@ ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = and <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -3272,6 +3348,7 @@ ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -3329,6 +3406,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <8 x i64> %a0, %2 = trunc <8 x i64> %1 to <8 x i16> @@ -3370,6 +3448,7 @@ ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -3456,6 +3535,7 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8: @@ -3465,6 +3545,7 @@ ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8: @@ -3474,6 +3555,7 @@ ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = and <16 x i64> %a0, %2 = trunc <16 x i64> %1 to <16 x i8> @@ -3529,6 +3611,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = and <16 x i32> %a0, %2 = trunc <16 x i32> %1 to <16 x i8> @@ -3572,6 +3655,7 @@ ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: @@ -3579,6 +3663,7 @@ ; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: @@ -3586,6 +3671,7 @@ ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), 
%xmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = and <16 x i16> %a0, %2 = trunc <16 x i16> %1 to <16 x i8> @@ -3626,6 +3712,7 @@ ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <4 x i64> %a0, %a1 %2 = trunc <4 x i64> %1 to <4 x i32> @@ -3689,6 +3776,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <8 x i64> %a0, %a1 %2 = trunc <8 x i64> %1 to <8 x i16> @@ -3732,6 +3820,7 @@ ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <8 x i32> %a0, %a1 %2 = trunc <8 x i32> %1 to <8 x i16> @@ -3832,6 +3921,7 @@ ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_v16i64_v16i8: @@ -3842,6 +3932,7 @@ ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_v16i64_v16i8: @@ -3852,6 +3943,7 @@ ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = xor <16 x i64> %a0, %a1 %2 = trunc <16 x i64> %1 to <16 x i8> @@ -3912,6 +4004,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <16 x i32> %a0, %a1 %2 = trunc <16 x i32> %1 to <16 x i8> @@ -3956,6 +4049,7 @@ ; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_xor_v16i16_v16i8: @@ -3963,6 +4057,7 @@ ; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8: @@ -3970,6 +4065,7 @@ ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %1 = xor <16 x i16> %a0, %a1 %2 = trunc <16 x i16> %1 to <16 x i8> @@ -4008,6 +4104,7 @@ ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <4 x i64> %a0, %2 = trunc <4 x i64> %1 to <4 x i32> @@ -4065,6 +4162,7 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <8 x i64> %a0, %2 = trunc <8 x i64> %1 to <8 x i16> @@ -4106,6 +4204,7 @@ ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = xor <8 x i32> %a0, %2 = trunc <8 x i32> %1 to <8 x i16> @@ -4192,6 +4291,7 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; 
AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8:
@@ -4201,6 +4301,7 @@
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8:
@@ -4210,6 +4311,7 @@
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = xor <16 x i64> %a0,
  %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -4265,6 +4367,7 @@
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <16 x i32> %a0,
  %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -4308,6 +4411,7 @@
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
@@ -4315,6 +4419,7 @@
; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
@@ -4322,6 +4427,7 @@
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = xor <16 x i16> %a0,
  %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4362,6 +4468,7 @@
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4425,6 +4532,7 @@
; AVX512: # BB#0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -4468,6 +4576,7 @@
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4568,6 +4677,7 @@
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i64_v16i8:
@@ -4578,6 +4688,7 @@
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i64_v16i8:
@@ -4588,6 +4699,7 @@
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -4648,6 +4760,7 @@
; AVX512: # BB#0:
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -4692,6 +4805,7 @@
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
@@ -4699,6 +4813,7 @@
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
@@ -4706,6 +4821,7 @@
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4744,6 +4860,7 @@
; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <4 x i64> %a0,
  %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4801,6 +4918,7 @@
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i64> %a0,
  %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -4842,6 +4960,7 @@
; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i32> %a0,
  %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4928,6 +5047,7 @@
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8:
@@ -4937,6 +5057,7 @@
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8:
@@ -4946,6 +5067,7 @@
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i64> %a0,
  %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -5001,6 +5123,7 @@
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i32> %a0,
  %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -5044,6 +5167,7 @@
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
@@ -5051,6 +5175,7 @@
; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
@@ -5058,6 +5183,7 @@
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0,
  %2 = trunc <16 x i16> %1 to <16 x i8>
Index: test/CodeGen/X86/vector-trunc.ll
===================================================================
--- test/CodeGen/X86/vector-trunc.ll
+++ test/CodeGen/X86/vector-trunc.ll
@@ -120,6 +120,7 @@
; AVX512-LABEL: trunc8i64_8i16:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i16>
@@ -175,6 +176,7 @@
; AVX512-LABEL: trunc8i64_8i8:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovqb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i8>
@@ -231,11 +233,13 @@
; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i16:
; AVX512VL: # BB#0: # %entry
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
@@ -243,11 +247,13 @@
; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i16>
@@ -309,11 +315,13 @@
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i8:
; AVX512VL: # BB#0: # %entry
; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
@@ -322,11 +330,13 @@
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i8:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i8>
@@ -411,6 +421,7 @@
; AVX512-LABEL: trunc16i32_16i16:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i16>
@@ -466,6 +477,7 @@
; AVX512-LABEL: trunc16i32_16i8:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i8>
@@ -529,6 +541,7 @@
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i16_16i8:
@@ -536,6 +549,7 @@
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8:
@@ -543,11 +557,13 @@
; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <16 x i16> %a to <16 x i8>
@@ -635,6 +651,7 @@
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc32i16_32i8:
@@ -645,16 +662,19 @@
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc32i16_32i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc32i16_32i8:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <32 x i16> %a to <32 x i8>
@@ -810,6 +830,7 @@
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
@@ -823,6 +844,7 @@
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
@@ -835,6 +857,7 @@
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
@@ -848,6 +871,7 @@
; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i16>
Index: test/CodeGen/X86/vector-tzcnt-128.ll
===================================================================
--- test/CodeGen/X86/vector-tzcnt-128.ll
+++ test/CodeGen/X86/vector-tzcnt-128.ll
@@ -281,6 +281,7 @@
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv2i64u:
@@ -696,6 +697,7 @@
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32u: