diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2125,51 +2125,83 @@
   return TargetLoweringBase::getPreferredVectorAction(VT);
 }
 
+static std::pair<MVT, unsigned>
+handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
+                                 const X86Subtarget &Subtarget) {
+  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
+  // convention is one that uses k registers.
+  if (NumElts == 2)
+    return {MVT::v2i64, 1};
+  if (NumElts == 4)
+    return {MVT::v4i32, 1};
+  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
+      CC != CallingConv::Intel_OCL_BI)
+    return {MVT::v8i16, 1};
+  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
+      CC != CallingConv::Intel_OCL_BI)
+    return {MVT::v16i8, 1};
+  // v32i1 passes in ymm unless we have BWI and the calling convention is
+  // regcall.
+  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
+    return {MVT::v32i8, 1};
+  // Split v64i1 vectors if we don't have v64i8 available.
+  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
+    if (Subtarget.useAVX512Regs())
+      return {MVT::v64i8, 1};
+    return {MVT::v32i8, 2};
+  }
+
+  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+  if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
+      NumElts > 64)
+    return {MVT::i8, NumElts};
+
+  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
+}
+
 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                      CallingConv::ID CC,
                                                      EVT VT) const {
-  // v32i1 vectors should be promoted to v32i8 to match avx2.
-  if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
-    return MVT::v32i8;
-  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
-      Subtarget.hasAVX512() &&
-      (!isPowerOf2_32(VT.getVectorNumElements()) ||
-       (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
-       (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
-    return MVT::i8;
-  // Split v64i1 vectors if we don't have v64i8 available.
-  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
-      CC != CallingConv::X86_RegCall)
-    return MVT::v32i1;
+      Subtarget.hasAVX512()) {
+    unsigned NumElts = VT.getVectorNumElements();
+
+    MVT RegisterVT;
+    unsigned NumRegisters;
+    std::tie(RegisterVT, NumRegisters) =
+        handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+    if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+      return RegisterVT;
+  }
+
+  // FIXME: Should we just make these types legal and custom split operations?
   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
       Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
     return MVT::v16i32;
+
   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
 }
 
 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                           CallingConv::ID CC,
                                                           EVT VT) const {
-  // v32i1 vectors should be promoted to v32i8 to match avx2.
-  if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
-    return 1;
-  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512() && - (!isPowerOf2_32(VT.getVectorNumElements()) || - (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || - (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) - return VT.getVectorNumElements(); - // Split v64i1 vectors if we don't have v64i8 available. - if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && - CC != CallingConv::X86_RegCall) - return 2; + Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); + + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return NumRegisters; + } + // FIXME: Should we just make these types legal and custom split operations? if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) return 1; + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -2180,8 +2212,8 @@ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512() && (!isPowerOf2_32(VT.getVectorNumElements()) || - (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || - (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) { + (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) || + VT.getVectorNumElements() > 64)) { RegisterVT = MVT::i8; IntermediateVT = MVT::i1; NumIntermediates = VT.getVectorNumElements(); @@ -2191,7 +2223,7 @@ // Split v64i1 vectors if we don't have v64i8 available. if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && CC != CallingConv::X86_RegCall) { - RegisterVT = MVT::v32i1; + RegisterVT = MVT::v32i8; IntermediateVT = MVT::v32i1; NumIntermediates = 2; return 2; diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -2710,3 +2710,561 @@ %q = and <7 x i1> %p, %i ret <7 x i1> %q } + +declare void @v2i1_mem_callee(<128 x i32> %x, <2 x i1> %y) +define void @v2i1_mem(<128 x i32> %x, <2 x i1> %y) { +; KNL-LABEL: v2i1_mem: +; KNL: ## %bb.0: +; KNL-NEXT: subq $24, %rsp +; KNL-NEXT: .cfi_def_cfa_offset 32 +; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: callq _v2i1_mem_callee +; KNL-NEXT: addq $24, %rsp +; KNL-NEXT: retq +; +; SKX-LABEL: v2i1_mem: +; SKX: ## %bb.0: +; SKX-NEXT: subq $24, %rsp +; SKX-NEXT: .cfi_def_cfa_offset 32 +; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: callq _v2i1_mem_callee +; SKX-NEXT: addq $24, %rsp +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; KNL_X32-LABEL: v2i1_mem: +; KNL_X32: ## %bb.0: +; KNL_X32-NEXT: pushl %ebp +; KNL_X32-NEXT: .cfi_def_cfa_offset 8 +; KNL_X32-NEXT: .cfi_offset %ebp, -8 +; KNL_X32-NEXT: movl %esp, %ebp +; KNL_X32-NEXT: .cfi_def_cfa_register %ebp +; KNL_X32-NEXT: andl $-64, %esp +; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 +; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: 
calll _v2i1_mem_callee +; KNL_X32-NEXT: movl %ebp, %esp +; KNL_X32-NEXT: popl %ebp +; KNL_X32-NEXT: retl + call void @v2i1_mem_callee(<128 x i32> %x, <2 x i1> %y) + ret void +} + +declare void @v4i1_mem_callee(<128 x i32> %x, <4 x i1> %y) +define void @v4i1_mem(<128 x i32> %x, <4 x i1> %y) { +; KNL-LABEL: v4i1_mem: +; KNL: ## %bb.0: +; KNL-NEXT: subq $24, %rsp +; KNL-NEXT: .cfi_def_cfa_offset 32 +; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: callq _v4i1_mem_callee +; KNL-NEXT: addq $24, %rsp +; KNL-NEXT: retq +; +; SKX-LABEL: v4i1_mem: +; SKX: ## %bb.0: +; SKX-NEXT: subq $24, %rsp +; SKX-NEXT: .cfi_def_cfa_offset 32 +; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: callq _v4i1_mem_callee +; SKX-NEXT: addq $24, %rsp +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; KNL_X32-LABEL: v4i1_mem: +; KNL_X32: ## %bb.0: +; KNL_X32-NEXT: pushl %ebp +; KNL_X32-NEXT: .cfi_def_cfa_offset 8 +; KNL_X32-NEXT: .cfi_offset %ebp, -8 +; KNL_X32-NEXT: movl %esp, %ebp +; KNL_X32-NEXT: .cfi_def_cfa_register %ebp +; KNL_X32-NEXT: andl $-64, %esp +; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 +; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: calll _v4i1_mem_callee +; KNL_X32-NEXT: movl %ebp, %esp +; KNL_X32-NEXT: popl %ebp +; KNL_X32-NEXT: retl + call void @v4i1_mem_callee(<128 x i32> %x, <4 x i1> %y) + ret void +} + +declare void @v8i1_mem_callee(<128 x i32> %x, <8 x i1> %y) +define void @v8i1_mem(<128 x i32> %x, <8 x i1> %y) { +; KNL-LABEL: v8i1_mem: +; KNL: ## %bb.0: +; KNL-NEXT: subq $24, %rsp +; KNL-NEXT: .cfi_def_cfa_offset 32 +; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: callq _v8i1_mem_callee +; KNL-NEXT: addq $24, %rsp +; KNL-NEXT: retq +; +; SKX-LABEL: v8i1_mem: +; SKX: ## %bb.0: +; SKX-NEXT: subq $24, %rsp +; SKX-NEXT: .cfi_def_cfa_offset 32 +; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: callq _v8i1_mem_callee +; SKX-NEXT: addq $24, %rsp +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; KNL_X32-LABEL: v8i1_mem: +; KNL_X32: ## %bb.0: +; KNL_X32-NEXT: pushl %ebp +; KNL_X32-NEXT: .cfi_def_cfa_offset 8 +; KNL_X32-NEXT: .cfi_offset %ebp, -8 +; KNL_X32-NEXT: movl %esp, %ebp +; KNL_X32-NEXT: .cfi_def_cfa_register %ebp +; KNL_X32-NEXT: andl $-64, %esp +; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 +; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: calll _v8i1_mem_callee +; KNL_X32-NEXT: movl %ebp, %esp +; KNL_X32-NEXT: popl %ebp +; KNL_X32-NEXT: retl + call void @v8i1_mem_callee(<128 x i32> %x, <8 x i1> %y) + ret void +} + +declare void @v16i1_mem_callee(<128 x i32> %x, <16 x i1> %y) +define void @v16i1_mem(<128 x i32> %x, <16 x i1> 
%y) { +; KNL-LABEL: v16i1_mem: +; KNL: ## %bb.0: +; KNL-NEXT: subq $24, %rsp +; KNL-NEXT: .cfi_def_cfa_offset 32 +; KNL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 +; KNL-NEXT: vmovaps %xmm8, (%rsp) +; KNL-NEXT: callq _v16i1_mem_callee +; KNL-NEXT: addq $24, %rsp +; KNL-NEXT: retq +; +; SKX-LABEL: v16i1_mem: +; SKX: ## %bb.0: +; SKX-NEXT: subq $24, %rsp +; SKX-NEXT: .cfi_def_cfa_offset 32 +; SKX-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm8 +; SKX-NEXT: vmovaps %xmm8, (%rsp) +; SKX-NEXT: callq _v16i1_mem_callee +; SKX-NEXT: addq $24, %rsp +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; KNL_X32-LABEL: v16i1_mem: +; KNL_X32: ## %bb.0: +; KNL_X32-NEXT: pushl %ebp +; KNL_X32-NEXT: .cfi_def_cfa_offset 8 +; KNL_X32-NEXT: .cfi_offset %ebp, -8 +; KNL_X32-NEXT: movl %esp, %ebp +; KNL_X32-NEXT: .cfi_def_cfa_register %ebp +; KNL_X32-NEXT: andl $-64, %esp +; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 +; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovaps 264(%ebp), %xmm4 +; KNL_X32-NEXT: vmovaps %xmm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: calll _v16i1_mem_callee +; KNL_X32-NEXT: movl %ebp, %esp +; KNL_X32-NEXT: popl %ebp +; KNL_X32-NEXT: retl + call void @v16i1_mem_callee(<128 x i32> %x, <16 x i1> %y) + ret void +} + +declare void @v32i1_mem_callee(<128 x i32> %x, <32 x i1> %y) +define void @v32i1_mem(<128 x i32> %x, <32 x i1> %y) { +; KNL-LABEL: v32i1_mem: +; KNL: ## %bb.0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rsp, %rbp +; KNL-NEXT: .cfi_def_cfa_register %rbp +; KNL-NEXT: andq $-32, %rsp +; KNL-NEXT: subq $64, %rsp +; KNL-NEXT: vmovaps 16(%rbp), %ymm8 +; KNL-NEXT: vmovaps %ymm8, (%rsp) +; KNL-NEXT: callq _v32i1_mem_callee +; KNL-NEXT: movq %rbp, %rsp +; KNL-NEXT: popq %rbp +; KNL-NEXT: retq +; +; SKX-LABEL: v32i1_mem: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rbp +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-32, %rsp +; SKX-NEXT: subq $64, %rsp +; SKX-NEXT: vmovaps 16(%rbp), %ymm8 +; SKX-NEXT: vmovaps %ymm8, (%rsp) +; SKX-NEXT: callq _v32i1_mem_callee +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; KNL_X32-LABEL: v32i1_mem: +; KNL_X32: ## %bb.0: +; KNL_X32-NEXT: pushl %ebp +; KNL_X32-NEXT: .cfi_def_cfa_offset 8 +; KNL_X32-NEXT: .cfi_offset %ebp, -8 +; KNL_X32-NEXT: movl %esp, %ebp +; KNL_X32-NEXT: .cfi_def_cfa_register %ebp +; KNL_X32-NEXT: andl $-64, %esp +; KNL_X32-NEXT: subl $384, %esp ## imm = 0x180 +; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 +; KNL_X32-NEXT: vmovaps 264(%ebp), %ymm4 +; KNL_X32-NEXT: vmovaps %ymm4, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: calll _v32i1_mem_callee +; KNL_X32-NEXT: movl %ebp, %esp +; KNL_X32-NEXT: popl %ebp +; KNL_X32-NEXT: retl + call void @v32i1_mem_callee(<128 x i32> %x, <32 x i1> %y) + ret void +} + +declare void 
@v64i1_mem_callee(<128 x i32> %x, <64 x i1> %y) +define void @v64i1_mem(<128 x i32> %x, <64 x i1> %y) { +; KNL-LABEL: v64i1_mem: +; KNL: ## %bb.0: +; KNL-NEXT: subq $472, %rsp ## imm = 0x1D8 +; KNL-NEXT: .cfi_def_cfa_offset 480 +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; 
KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax +; KNL-NEXT: movl %eax, (%rsp) +; KNL-NEXT: callq _v64i1_mem_callee +; KNL-NEXT: addq $472, %rsp ## imm = 0x1D8 +; KNL-NEXT: retq +; +; SKX-LABEL: v64i1_mem: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rbp +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: .cfi_offset %rbp, -16 +; SKX-NEXT: movq %rsp, %rbp +; SKX-NEXT: .cfi_def_cfa_register %rbp +; SKX-NEXT: andq $-64, %rsp +; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: vmovaps 16(%rbp), %zmm8 +; SKX-NEXT: vmovaps %zmm8, (%rsp) +; SKX-NEXT: callq _v64i1_mem_callee +; SKX-NEXT: movq %rbp, %rsp +; SKX-NEXT: popq %rbp +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; KNL_X32-LABEL: v64i1_mem: +; KNL_X32: ## %bb.0: +; KNL_X32-NEXT: pushl %ebp +; KNL_X32-NEXT: .cfi_def_cfa_offset 8 +; KNL_X32-NEXT: .cfi_offset %ebp, -8 +; KNL_X32-NEXT: movl %esp, %ebp +; KNL_X32-NEXT: .cfi_def_cfa_register %ebp +; KNL_X32-NEXT: andl $-64, %esp +; KNL_X32-NEXT: subl $576, %esp ## imm = 0x240 +; KNL_X32-NEXT: vmovaps 8(%ebp), %zmm4 +; KNL_X32-NEXT: vmovaps 72(%ebp), %zmm5 +; KNL_X32-NEXT: vmovaps 136(%ebp), %zmm6 +; KNL_X32-NEXT: vmovaps 200(%ebp), %zmm7 +; KNL_X32-NEXT: movl 516(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 512(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 508(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 504(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 500(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 496(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 492(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 488(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 484(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 480(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 476(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 472(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 468(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) 
+; KNL_X32-NEXT: movl 464(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 460(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 456(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 452(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 448(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 444(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 440(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 436(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 432(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 428(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 424(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 420(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 416(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 412(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 408(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 404(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 400(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 396(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 392(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 388(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 384(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 380(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 376(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 372(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 368(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 364(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 360(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 356(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 352(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 348(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 344(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 340(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 336(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 332(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 328(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 324(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 320(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 316(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 312(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 308(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 304(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 300(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 296(%ebp), %eax +; KNL_X32-NEXT: movl %eax, 
{{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 292(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 288(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 284(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 280(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 276(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 272(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 268(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: movl 264(%ebp), %eax +; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm7, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm6, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm5, {{[0-9]+}}(%esp) +; KNL_X32-NEXT: vmovaps %zmm4, (%esp) +; KNL_X32-NEXT: calll _v64i1_mem_callee +; KNL_X32-NEXT: movl %ebp, %esp +; KNL_X32-NEXT: popl %ebp +; KNL_X32-NEXT: retl + call void @v64i1_mem_callee(<128 x i32> %x, <64 x i1> %y) + ret void +} diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll --- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -314,31 +314,40 @@ ; ; WIN64-LABEL: test_argv32i1: ; WIN64: # %bb.0: # %entry +; WIN64-NEXT: pushq %rbp +; WIN64-NEXT: .seh_pushreg %rbp ; WIN64-NEXT: pushq %r11 ; WIN64-NEXT: .seh_pushreg %r11 ; WIN64-NEXT: pushq %r10 ; WIN64-NEXT: .seh_pushreg %r10 ; WIN64-NEXT: pushq %rsp ; WIN64-NEXT: .seh_pushreg %rsp -; WIN64-NEXT: subq $32, %rsp -; WIN64-NEXT: .seh_stackalloc 32 +; WIN64-NEXT: subq $152, %rsp +; WIN64-NEXT: .seh_stackalloc 152 +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp +; WIN64-NEXT: .seh_setframe %rbp, 128 ; WIN64-NEXT: .seh_endprologue +; WIN64-NEXT: andq $-32, %rsp ; WIN64-NEXT: kmovd %edx, %k0 -; WIN64-NEXT: kmovd %ecx, %k1 -; WIN64-NEXT: kmovd %eax, %k2 +; WIN64-NEXT: kmovd %eax, %k1 +; WIN64-NEXT: kmovd %ecx, %k2 ; WIN64-NEXT: vpmovm2b %k2, %zmm0 -; WIN64-NEXT: vpmovm2b %k1, %zmm1 -; WIN64-NEXT: vpmovm2b %k0, %zmm2 -; WIN64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; WIN64-NEXT: # kill: def $ymm1 killed $ymm1 killed $zmm1 -; WIN64-NEXT: # kill: def $ymm2 killed $ymm2 killed $zmm2 +; WIN64-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: vpmovm2b %k1, %zmm0 +; WIN64-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: vpmovm2b %k0, %zmm0 +; WIN64-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %r8 +; WIN64-NEXT: vzeroupper ; WIN64-NEXT: callq test_argv32i1helper ; WIN64-NEXT: nop -; WIN64-NEXT: addq $32, %rsp +; WIN64-NEXT: leaq 24(%rbp), %rsp ; WIN64-NEXT: popq %rsp ; WIN64-NEXT: popq %r10 ; WIN64-NEXT: popq %r11 -; WIN64-NEXT: vzeroupper +; WIN64-NEXT: popq %rbp ; WIN64-NEXT: retq ; WIN64-NEXT: .seh_handlerdata ; WIN64-NEXT: .text @@ -552,22 +561,25 @@ ; WIN64-NEXT: .seh_pushreg %r10 ; WIN64-NEXT: pushq %rsp ; WIN64-NEXT: .seh_pushreg %rsp -; WIN64-NEXT: subq $32, %rsp -; WIN64-NEXT: .seh_stackalloc 32 +; WIN64-NEXT: subq $80, %rsp +; WIN64-NEXT: .seh_stackalloc 80 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: kmovd %edx, %k0 -; WIN64-NEXT: kmovd %ecx, %k1 -; WIN64-NEXT: kmovd %eax, %k2 +; WIN64-NEXT: kmovd %eax, %k1 +; WIN64-NEXT: kmovd %ecx, %k2 ; WIN64-NEXT: vpmovm2b %k2, %zmm0 -; WIN64-NEXT: vpmovm2b %k1, %zmm1 -; WIN64-NEXT: vpmovm2b %k0, %zmm2 -; WIN64-NEXT: # kill: def $xmm0 killed $xmm0 killed 
$zmm0 -; WIN64-NEXT: # kill: def $xmm1 killed $xmm1 killed $zmm1 -; WIN64-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; WIN64-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: vpmovm2b %k1, %zmm0 +; WIN64-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: vpmovm2b %k0, %zmm0 +; WIN64-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %r8 ; WIN64-NEXT: vzeroupper ; WIN64-NEXT: callq test_argv16i1helper ; WIN64-NEXT: nop -; WIN64-NEXT: addq $32, %rsp +; WIN64-NEXT: addq $80, %rsp ; WIN64-NEXT: popq %rsp ; WIN64-NEXT: popq %r10 ; WIN64-NEXT: popq %r11 @@ -789,22 +801,25 @@ ; WIN64-NEXT: .seh_pushreg %r10 ; WIN64-NEXT: pushq %rsp ; WIN64-NEXT: .seh_pushreg %rsp -; WIN64-NEXT: subq $32, %rsp -; WIN64-NEXT: .seh_stackalloc 32 +; WIN64-NEXT: subq $80, %rsp +; WIN64-NEXT: .seh_stackalloc 80 ; WIN64-NEXT: .seh_endprologue ; WIN64-NEXT: kmovd %edx, %k0 -; WIN64-NEXT: kmovd %ecx, %k1 -; WIN64-NEXT: kmovd %eax, %k2 +; WIN64-NEXT: kmovd %eax, %k1 +; WIN64-NEXT: kmovd %ecx, %k2 ; WIN64-NEXT: vpmovm2w %k2, %zmm0 -; WIN64-NEXT: vpmovm2w %k1, %zmm1 -; WIN64-NEXT: vpmovm2w %k0, %zmm2 -; WIN64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; WIN64-NEXT: # kill: def $xmm1 killed $xmm1 killed $zmm1 -; WIN64-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; WIN64-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: vpmovm2w %k1, %zmm0 +; WIN64-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: vpmovm2w %k0, %zmm0 +; WIN64-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %r8 ; WIN64-NEXT: vzeroupper ; WIN64-NEXT: callq test_argv8i1helper ; WIN64-NEXT: nop -; WIN64-NEXT: addq $32, %rsp +; WIN64-NEXT: addq $80, %rsp ; WIN64-NEXT: popq %rsp ; WIN64-NEXT: popq %r10 ; WIN64-NEXT: popq %r11