diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -935,7 +935,10 @@ ; calls @llvm.eh.unwind.init [if needs FP] [for all callee-saved XMM registers] - movaps %<xmm reg>, -MMM(%rbp) + [if funclet] + movaps %<xmm reg>, -MMM(%rsp) + [else] + movaps %<xmm reg>, -MMM(%rbp) [for all callee-saved XMM registers] .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset) ; i.e. the offset relative to (%rbp - SEHFrameOffset) @@ -955,7 +958,10 @@ ; Emit CFI info [if needs FP] [for all callee-saved registers] - .cfi_offset %<reg>, (offset from %rbp) + [if funclet] + movaps -MMM(%rsp), %<xmm reg> + [else] + .cfi_offset %<reg>, (offset from %rbp) [else] .cfi_def_cfa_offset (offset from RETADDR) [for all callee-saved registers] @@ -1177,11 +1183,16 @@ MFI.setOffsetAdjustment(-StackSize); } - // For EH funclets, only allocate enough space for outgoing calls. Save the - // NumBytes value that we would've used for the parent frame. + // For EH funclets, only allocate enough space for outgoing calls and callee + // saved XMM registers on Windows 64 bits. Save the NumBytes value that we + // would've used for the parent frame. + int XMMFrameSlotOrigin; unsigned ParentFrameNumBytes = NumBytes; - if (IsFunclet) + if (IsFunclet) { NumBytes = getWinEHFuncletFrameSize(MF); + if (IsWin64Prologue) + NumBytes += X86FI->getCalleeSavedXMMFrameInfo(XMMFrameSlotOrigin); + } // Skip the callee-saved push instructions. 
bool PushedRegs = false; @@ -1389,23 +1400,33 @@ } while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { - const MachineInstr &FrameInstr = *MBBI; + auto FrameInstr = MBBI; ++MBBI; if (NeedsWinCFI) { int FI; - if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { + if (unsigned Reg = TII.isStoreToStackSlot(*FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { - unsigned IgnoredFrameReg; - int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); - Offset += SEHFrameOffset; HasWinCFI = true; - assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data"); - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) - .addImm(Reg) - .addImm(Offset) - .setMIFlag(MachineInstr::FrameSetup); + if (IsFunclet) { + assert(IsWin64Prologue && "Only valid on Windows 64bit"); + int Offset = (FI - XMMFrameSlotOrigin - 1) * 16; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOVAPSmr)), + StackPtr, true, Offset + (NumBytes & -16)) + .addReg(Reg) + .setMIFlag(MachineInstr::FrameSetup); + MBB.erase(FrameInstr); + } + else { + assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data"); + unsigned IgnoredFrameReg; + int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) + .addImm(Reg) + .addImm(Offset + SEHFrameOffset) + .setMIFlag(MachineInstr::FrameSetup); + } } } } @@ -1949,6 +1970,8 @@ X86MachineFunctionInfo *X86FI = MF.getInfo(); unsigned CalleeSavedFrameSize = 0; + unsigned CalleeSavedXMMFrameSize = 0; + int CalleeSavedXMMSlotOrigin = 0; int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); @@ -2032,8 +2055,17 @@ int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); MFI.ensureMaxAlignment(Align); + // We assume XMM slot is consecutive in stack, so we just need to record + // the first one + if (CalleeSavedXMMFrameSize == 0) { + 
CalleeSavedXMMSlotOrigin = SlotIndex; + } + CalleeSavedXMMFrameSize += 16; } + X86FI->setCalleeSavedXMMFrameInfo(CalleeSavedXMMFrameSize, + CalleeSavedXMMSlotOrigin); + return true; } @@ -2165,21 +2197,40 @@ DebugLoc DL = MBB.findDebugLoc(MI); // Reload XMMs from stack frame. + MachineFunction &MF = *MBB.getParent(); + X86MachineFunctionInfo *X86FI = MF.getInfo(); + int XMMFrameSlotOrigin; + int XMMFrameSlotSize = X86FI->getCalleeSavedXMMFrameInfo(XMMFrameSlotOrigin); + int SEHFrameOffset = XMMFrameSlotSize + + MF.getFrameInfo().getMaxCallFrameSize(); for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - if (X86::GR64RegClass.contains(Reg) || - X86::GR32RegClass.contains(Reg)) - continue; + if (MBB.isEHFuncletEntry() && STI.is64Bit()) { + if (X86::FR64RegClass.contains(Reg)) { + int Offset = (CSI[i].getFrameIdx() - XMMFrameSlotOrigin - 1) * 16; + addRegOffset(BuildMI(MBB, MI, DL, TII.get(X86::MOVAPSrm), Reg), + X86::RSP, true, SEHFrameOffset + Offset); + } + } + else { + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; - // If this is k-register make sure we lookup via the largest legal type. - MVT VT = MVT::Other; - if (X86::VK16RegClass.contains(Reg)) - VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; + // If this is k-register make sure we lookup via the largest legal type. + MVT VT = MVT::Other; + if (X86::VK16RegClass.contains(Reg)) + VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); + } } + if (MBB.isEHFuncletEntry() && STI.is64Bit()) + BuildMI(MBB, MI, DL, TII.get(X86::ADD64ri8), X86::RSP) + .addReg(X86::RSP) + .addImm(XMMFrameSlotSize); + // POP GPRs. unsigned Opc = STI.is64Bit() ? 
X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -40,6 +40,14 @@ /// stack frame in bytes. unsigned CalleeSavedFrameSize = 0; + /// CalleeSavedXMMFrameSize - Size of the callee-saved XMM register portion + /// of the stack frame in bytes. + unsigned CalleeSavedXMMFrameSize = 0; + + /// CalleeSavedXMMFrameOrigin - Origin slot of the callee-saved XMM register + /// portion of the stack frame. + int CalleeSavedXMMFrameOrigin = 0; + /// BytesToPopOnReturn - Number of bytes function pops on return (in addition /// to the space used by the return address). /// Used on windows platform for stdcall & fastcall name decoration @@ -123,6 +131,11 @@ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } + unsigned getCalleeSavedXMMFrameInfo(int &origin) const + { origin = CalleeSavedXMMFrameOrigin; return CalleeSavedXMMFrameSize; } + void setCalleeSavedXMMFrameInfo(unsigned size, int origin) + { CalleeSavedXMMFrameSize = size; CalleeSavedXMMFrameOrigin = origin; } + unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; } void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;} diff --git a/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll b/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll --- a/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll +++ b/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll @@ -51,3 +51,18 @@ ; CHECK: popq %rbp ; CHECK: retq ; CHECK: .seh_handlerdata +; CHECK: # %catch +; CHECK: movq %rdx, 16(%rsp) +; CHECK: pushq %rbp +; CHECK: .seh_pushreg 5 +; CHECK: subq $48, %rsp +; CHECK: .seh_stackalloc 48 +; CHECK: leaq 64(%rdx), %rbp +; CHECK: movapd %xmm6, 32(%rsp) +; CHECK: .seh_endprologue +; 
CHECK: movapd 32(%rsp), %xmm6 +; CHECK: addq $16, %rsp +; CHECK: leaq .LBB0_1(%rip), %rax +; CHECK: addq $32, %rsp +; CHECK: popq %rbp +; CHECK: retq # CATCHRET diff --git a/llvm/test/CodeGen/X86/win64-funlet-savexmm.ll b/llvm/test/CodeGen/X86/win64-funlet-savexmm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/win64-funlet-savexmm.ll @@ -0,0 +1,67 @@ +; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s + +; void foo(void) +; { +; __asm("nop" ::: "bx", "cx", "xmm5", "xmm6", "xmm7"); +; try { +; throw; +; } +; catch (int x) { +; } +; } + +%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] } +%eh.ThrowInfo = type { i32, i8*, i8*, i8* } + +$"??_R0H@8" = comdat any + +@"??_7type_info@@6B@" = external constant i8* +@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat + +declare dso_local i32 @__CxxFrameHandler3(...) +declare dso_local x86_stdcallcc void @_CxxThrowException(i8*, %eh.ThrowInfo*) + +define dso_local void @"?foo@@YAXXZ"() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +entry: + %x = alloca i32, align 4 + call void asm sideeffect "nop", "~{bx},~{cx},~{xmm5},~{xmm6},~{xmm7}"() + invoke void @_CxxThrowException(i8* null, %eh.ThrowInfo* null) + to label %unreachable unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %1 = catchpad within %0 [%rtti.TypeDescriptor2* @"??_R0H@8", i32 0, i32* %x] + catchret from %1 to label %catchret.dest + +catchret.dest: ; preds = %catch + br label %try.cont + +try.cont: ; preds = %catchret.dest + ret void + +unreachable: ; preds = %entry + unreachable +} + +; CHECK: # %catch +; CHECK: movq %rdx, 16(%rsp) +; CHECK: pushq %rbp +; CHECK: .seh_pushreg 5 +; CHECK: pushq %rbx +; CHECK: .seh_pushreg 3 +; CHECK: subq $72, %rsp +; CHECK: .seh_stackalloc 72 +; CHECK: leaq 80(%rdx), %rbp +; CHECK: movaps %xmm7, 
48(%rsp) +; CHECK: movaps %xmm6, 32(%rsp) +; CHECK: .seh_endprologue +; CHECK: movaps 32(%rsp), %xmm6 +; CHECK: movaps 48(%rsp), %xmm7 +; CHECK: addq $32, %rsp +; CHECK: leaq .LBB0_3(%rip), %rax +; CHECK: addq $40, %rsp +; CHECK: popq %rbx +; CHECK: popq %rbp +; CHECK: retq # CATCHRET