Index: lib/Target/X86/X86CallingConv.cpp =================================================================== --- lib/Target/X86/X86CallingConv.cpp +++ lib/Target/X86/X86CallingConv.cpp @@ -162,7 +162,11 @@ // created on top of the basic 32 bytes of win64. // It can happen if the fifth or sixth argument is vector type or HVA. // At that case for each argument a shadow stack of 8 bytes is allocated. - if (Reg == X86::XMM4 || Reg == X86::XMM5) + const TargetRegisterInfo *TRI = + State.getMachineFunction().getSubtarget().getRegisterInfo(); + if (Reg == X86::XMM4 || Reg == X86::XMM5 || + X86::XMM4 == TRI->getSubReg(Reg, X86::sub_xmm) || + X86::XMM5 == TRI->getSubReg(Reg, X86::sub_xmm)) State.AllocateStack(8, 8); if (!ArgFlags.isHva()) { Index: test/CodeGen/X86/x86-64-veccallcc.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/x86-64-veccallcc.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s + +; Test 1st and 2nd arguments passed in YMM0 and YMM1. +; Test 7th argument passed by reference on the stack: 56(%rsp). +define x86_vectorcallcc <8 x float> @test_m256_7(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, <8 x float> %e, <8 x float> %f, <8 x float> %g) #0 { + ; CHECK-LABEL: test_m256_7@@224: + ; CHECK: movq 56(%rsp), %rax + ; CHECK: vaddps %ymm1, %ymm0, %ymm0 + ; CHECK: vsubps (%rax), %ymm0, %ymm0 + %add.i = fadd <8 x float> %a, %b + %sub.i = fsub <8 x float> %add.i, %g + ret <8 x float> %sub.i +} + +attributes #0 = { nounwind "target-cpu"="core-avx2" }