diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -217,13 +217,23 @@
                            const DebugLoc &DL, uint64_t Offset) const;
 
+public:
   /// Emit a stub to later inline the target stack probe.
+  MachineInstr *
+  emitStackProbeInlineStubFromReg(MachineFunction &MF, MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  const DebugLoc &DL, unsigned Reg) const;
+  MachineInstr *
+  emitStackProbeInlineStubFromImm(MachineFunction &MF, MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  const DebugLoc &DL, uint64_t Imm) const;
   MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
                                          MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator MBBI,
                                          const DebugLoc &DL, bool InProlog) const;
 
+private:
   /// Aligns the stack pointer by ANDing it with -MaxAlign.
   void BuildStackAlignAND(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -275,13 +275,8 @@
       // Delegate stack probing to the `inlineStackProbe` mechanism to avoid
       // complications.
-      MachineInstr *Stub = emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+      emitStackProbeInlineStubFromImm(MF, MBB, MBBI, DL, Offset);
 
-      // Encode the static offset as a metadata attached to the stub.
-      LLVMContext &Context = MF.getFunction().getContext();
-      MachineInstrBuilder(MF, Stub).addMetadata(
-          MDTuple::get(Context, {ConstantAsMetadata::get(ConstantInt::get(
-                           IntegerType::get(Context, 64), Offset))}));
       return;
     } else if (Offset > Chunk) {
       // Rather than emit a long series of instructions for large offsets,
@@ -568,31 +563,38 @@
 void X86FrameLowering::emitStackProbeInlineGeneric(
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
-  MachineInstr &CallToInline = *std::prev(MBBI);
-  assert(CallToInline.getOperand(1).isMetadata() &&
-         "no metadata attached to that probe");
-  uint64_t Offset =
-      cast<ConstantInt>(
-          cast<ConstantAsMetadata>(
-              cast<MDTuple>(CallToInline.getOperand(1).getMetadata())
-                  ->getOperand(0))
-              ->getValue())
-          ->getZExtValue();
+
+  // The stub emitted by emitStackProbeInlineStubFrom{Imm,Reg} is a MOV into
+  // RAX followed by the __chkstk_stub call; MBBI points one past that call,
+  // so the size-carrying MOV sits two instructions back.
+  MachineInstr &SizeHolder = *std::prev(std::prev(MBBI));
 
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   const X86TargetLowering &TLI = *STI.getTargetLowering();
 
   assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&
          "different expansion expected for CoreCLR 64 bit");
 
+  // A size held in a register is only known at runtime, so it always takes
+  // the loop expansion.
+  if (SizeHolder.getOperand(1).isReg()) {
+    emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL,
+                                    SizeHolder.getOperand(1));
+    SizeHolder.eraseFromParent();
+    return;
+  }
+
   const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
   uint64_t ProbeChunk = StackProbeSize * 8;
+  uint64_t Offset = SizeHolder.getOperand(1).getImm();
 
   // Synthesize a loop or unroll it, depending on the number of iterations.
   if (Offset > ProbeChunk) {
-    emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset);
+    emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL,
+                                    SizeHolder.getOperand(1));
   } else {
     emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset);
   }
+  SizeHolder.eraseFromParent();
 }
 
 void X86FrameLowering::emitStackProbeInlineGenericBlock(
@@ -641,7 +642,7 @@
 void X86FrameLowering::emitStackProbeInlineGenericLoop(
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-    uint64_t Offset) const {
+    const MachineOperand &OffsetOperand) const {
 
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   const X86TargetLowering &TLI = *STI.getTargetLowering();
@@ -665,12 +666,21 @@
       .setMIFlag(MachineInstr::FrameSetup);
 
   // save loop bound
-  {
+  if (OffsetOperand.isImm()) {
+    const uint64_t Offset = OffsetOperand.getImm();
     const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
     BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackPtr)
         .addReg(FinalStackPtr)
         .addImm(Offset / StackProbeSize * StackProbeSize)
         .setMIFlag(MachineInstr::FrameSetup);
+  } else {
+    // Runtime size: the bound is the exact final value of the stack pointer.
+    const unsigned SizeReg = OffsetOperand.getReg();
+    const unsigned Opc = getSUBrrOpcode(Uses64BitFramePtr);
+    BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackPtr)
+        .addReg(FinalStackPtr)
+        .addReg(SizeReg)
+        .setMIFlag(MachineInstr::FrameSetup);
   }
 
   // allocate a page
@@ -698,7 +708,10 @@
   // jump
+  // Loop while the stack pointer is still above the bound. A runtime-sized
+  // allocation need not end on an exact multiple of the probe size, so the
+  // equality test COND_NE used to perform would run past the bound.
   BuildMI(testMBB, DL, TII.get(X86::JCC_1))
       .addMBB(testMBB)
-      .addImm(X86::COND_NE)
+      .addImm(X86::COND_G)
       .setMIFlag(MachineInstr::FrameSetup);
   testMBB->addSuccessor(testMBB);
   testMBB->addSuccessor(tailMBB);
@@ -710,11 +723,20 @@
   tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);
   MBB.addSuccessor(testMBB);
 
-  if (Offset % StackProbeSize) {
-    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
-    BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
-        .addReg(StackPtr)
-        .addImm(Offset % StackProbeSize)
+  // Handle the tail: a register size snaps the stack pointer to the exact
+  // bound computed above; an immediate size allocates the remainder that
+  // the page-sized loop did not cover.
+  if (OffsetOperand.isReg()) {
+    BuildMI(*tailMBB, tailMBB->begin(), DL,
+            TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
+            StackPtr)
+        .addReg(FinalStackPtr)
+        .setMIFlag(MachineInstr::FrameSetup);
+  } else if (uint64_t Tail = OffsetOperand.getImm() % StackProbeSize) {
+    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Tail);
+    BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
+        .addReg(StackPtr)
+        .addImm(Tail)
         .setMIFlag(MachineInstr::FrameSetup);
   }
 }
@@ -1026,6 +1048,25 @@
       .addExternalSymbol("__chkstk_stub");
 }
 
+MachineInstr *X86FrameLowering::emitStackProbeInlineStubFromImm(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+    uint64_t Offset) const {
+  // Use MOV64ri so offsets wider than 32 bits stay representable; the
+  // immediate is only a carrier, consumed when the stub is inlined.
+  BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX).addImm(Offset);
+  return BuildMI(MBB, MBBI, DL, TII.get(X86::CALL64pcrel32))
+      .addExternalSymbol("__chkstk_stub");
+}
+
+MachineInstr *X86FrameLowering::emitStackProbeInlineStubFromReg(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned Reg) const {
+  BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(Reg);
+  return BuildMI(MBB, MBBI, DL, TII.get(X86::CALL64pcrel32))
+      .addExternalSymbol("__chkstk_stub");
+}
+
 static unsigned calculateSetFPREG(uint64_t SPAdjust) {
   // Win64 ABI has a less restrictive limitation of 240; 128 works equally well
   // and might require smaller successive adjustments.
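
Note (commentary, not part of the patch): the hunks above lean on two assumptions worth spelling out. First, emitStackProbeInlineGenericLoop's declaration in X86FrameLowering.h must be updated to take const MachineOperand & as well, and getSUBrrOpcode must already exist next to getSUBriOpcode; neither hunk is shown here. Second, emitStackProbeInlineGeneric can only find the size-carrying MOV at std::prev(std::prev(MBBI)) because of how the stub is consumed. The sketch below mirrors the shape of X86FrameLowering::inlineStackProbe in the baseline tree (its exact shape is an assumption, and MF/PrologBB are the names that hook already uses): the __chkstk_stub call is located by symbol, the expansion hook gets an iterator one past it, and the call itself is erased afterwards, leaving the MOV for the expansion to find and erase.

    // Hedged sketch of the assumed consumer, not part of this diff.
    for (MachineInstr &MI : PrologMBB) {
      if (MI.isCall() && MI.getOperand(0).isSymbol() &&
          StringRef("__chkstk_stub") == MI.getOperand(0).getSymbolName()) {
        // One past the call: the MOV into RAX emitted by
        // emitStackProbeInlineStubFrom{Imm,Reg} is two instructions back.
        MachineBasicBlock::iterator MBBI = std::next(MI.getIterator());
        emitStackProbeInline(MF, PrologMBB, MBBI, MI.getDebugLoc(),
                             /*InProlog=*/true);
        MI.eraseFromParent();
        break;
      }
    }
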
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31545,97 +31545,27 @@
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
                                            MachineBasicBlock *BB) const {
-  MachineFunction *MF = BB->getParent();
+  MachineFunction &MF = *BB->getParent();
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
   DebugLoc DL = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-
-  const unsigned ProbeSize = getStackProbeSize(*MF);
-
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-  MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-
-  MachineFunction::iterator MBBIter = ++BB->getIterator();
-  MF->insert(MBBIter, testMBB);
-  MF->insert(MBBIter, blockMBB);
-  MF->insert(MBBIter, tailMBB);
-
-  unsigned sizeVReg = MI.getOperand(1).getReg();
-
-  const TargetRegisterClass *SizeRegClass = MRI.getRegClass(sizeVReg);
-
-  unsigned tmpSizeVReg = MRI.createVirtualRegister(SizeRegClass);
-  unsigned tmpSizeVReg2 = MRI.createVirtualRegister(SizeRegClass);
-
-  unsigned physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
-
-  // test rsp size
-  BuildMI(testMBB, DL, TII->get(X86::PHI), tmpSizeVReg)
-      .addReg(sizeVReg)
-      .addMBB(BB)
-      .addReg(tmpSizeVReg2)
-      .addMBB(blockMBB);
-
-  BuildMI(testMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::CMP64ri32 : X86::CMP32ri))
-      .addReg(tmpSizeVReg)
-      .addImm(ProbeSize);
-
-  BuildMI(testMBB, DL, TII->get(X86::JCC_1))
-      .addMBB(tailMBB)
-      .addImm(X86::COND_L);
-  testMBB->addSuccessor(blockMBB);
-  testMBB->addSuccessor(tailMBB);
-
-  // allocate a block and touch it
-
-  BuildMI(blockMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
-          tmpSizeVReg2)
-      .addReg(tmpSizeVReg)
-      .addImm(ProbeSize);
-
-  BuildMI(blockMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
-          physSPReg)
-      .addReg(physSPReg)
-      .addImm(ProbeSize);
-
-  const unsigned MovMIOpc =
-      TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi;
-  addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
-      .addImm(0);
-
-  BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
-  blockMBB->addSuccessor(testMBB);
-
-  // allocate the tail and continue
-  BuildMI(tailMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr),
-          physSPReg)
-      .addReg(physSPReg)
-      .addReg(tmpSizeVReg);
-
-  // touch the tail too, as we don't have any information about the context
-  addRegOffset(BuildMI(tailMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
-      .addImm(0);
-
-  BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
-      .addReg(physSPReg);
-
-  tailMBB->splice(tailMBB->end(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  tailMBB->transferSuccessorsAndUpdatePHIs(BB);
-  BB->addSuccessor(testMBB);
+  MachineBasicBlock::iterator MBBI(MI);
+  TFI.emitStackProbeInlineStubFromReg(MF, *BB, MBBI, DL,
+                                      MI.getOperand(1).getReg());
+
+  // The stub is expanded by inlineStackProbe into the probing loop; once
+  // that happens, the stack pointer at this point is the start of the
+  // allocated area, which is the result of the PROBED_ALLOCA pseudo.
+  const unsigned physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
+  BuildMI(*BB, MBBI, DL, TII->get(TargetOpcode::COPY),
+          MI.getOperand(0).getReg())
+      .addReg(physSPReg);
 
   // Delete the original pseudo instruction.
   MI.eraseFromParent();
 
   // And we're done.
-  return tailMBB;
+  return BB;
 }
 
 MachineBasicBlock *
diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
--- a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
@@ -8,32 +8,30 @@
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushq %rbp
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movq %rsp, %rbp
-; CHECK-NEXT:    .cfi_def_cfa_register %rbp
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    leaq 15(,%rax,4), %rax
-; CHECK-NEXT:    andq $-16, %rax
-; CHECK-NEXT:    cmpq $4096, %rax # imm = 0x1000
-; CHECK-NEXT:    jl .LBB0_3
-; CHECK-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subq $4096, %rax # imm = 0x1000
-; CHECK-NEXT:    subq $4096, %rsp # imm = 0x1000
-; CHECK-NEXT:    movq $0, (%rsp)
-; CHECK-NEXT:    cmpq $4096, %rax # imm = 0x1000
-; CHECK-NEXT:    jge .LBB0_2
-; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    subq %rax, %rsp
-; CHECK-NEXT:    movq %rsp, %rax
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    leaq 15(,%rax,4), %rax
+; CHECK-NEXT:    movq %rsp, %r11
+; CHECK-NEXT:    subq %rax, %r11
+; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    subq $4096, %rsp # imm = 0x1000
 ; CHECK-NEXT:    movq $0, (%rsp)
-; CHECK-NEXT:    movl $1, 4792(%rax)
-; CHECK-NEXT:    movl (%rax), %eax
-; CHECK-NEXT:    movq %rbp, %rsp
-; CHECK-NEXT:    popq %rbp
-; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    cmpq %r11, %rsp
+; CHECK-NEXT:    jg .LBB0_1
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:    movq %r11, %rsp
+; CHECK-NEXT:    movq %rsp, %rcx
+; CHECK-NEXT:    movl $1, 4792(%rcx)
+; CHECK-NEXT:    movl (%rcx), %eax
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    retq
+
   %a = alloca i32, i32 %n, align 16
   %b = getelementptr inbounds i32, i32* %a, i64 1198
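
Note (commentary, not part of the patch): the updated CHECK lines encode the loop's contract: probe one page at a time while %rsp is still above the bound kept in %r11, then snap %rsp to that bound. The standalone program below (plain C++, not LLVM code; PageSize stands in for StackProbeSize and Final for FinalStackPtr) simulates the register-sized path and checks the two properties the expansion relies on. It also shows why the back-branch must be jg: with jle the loop would exit after the first page whenever more than one page remained unallocated.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t PageSize = 4096; // StackProbeSize
      const uint64_t Sizes[] = {100, 4096, 5000, 100000};
      for (uint64_t Size : Sizes) {
        uint64_t SP = 1 << 20;            // toy incoming %rsp
        const uint64_t Final = SP - Size; // movq %rsp, %r11; subq %rax, %r11
        uint64_t LastProbe = SP;
        do {                                   // .LBB0_1:
          SP -= PageSize;                      //   subq $4096, %rsp
          LastProbe = SP;                      //   movq $0, (%rsp)
        } while ((int64_t)(SP - Final) > 0);   //   cmpq %r11, %rsp; jg .LBB0_1
        SP = Final;                            // movq %r11, %rsp
        assert(SP == Final && "allocation has the exact requested size");
        assert(Final - LastProbe < PageSize &&
               "every page of the allocation is within a page of a probe");
      }
      return 0;
    }
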