diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -273,9 +273,8 @@
   // allocation is split in smaller chunks anyway.
   if (EmitInlineStackProbe && !InEpilogue) {
-    // stack probing may involve looping, and control flow generations is
-    // disallowed at this point. Rely to later processing through
-    // `inlineStackProbe`.
+    // Stack probing may require looping, and control flow cannot be
+    // generated at this point; delegate to the `inlineStackProbe` mechanism.
     MachineInstr *Stub = emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
 
     // Encode the static offset as a metadata attached to the stub.
@@ -643,6 +642,7 @@
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
     uint64_t Offset) const {
+  assert(Offset && "zero offset");
   const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
   const X86TargetLowering &TLI = *STI.getTargetLowering();
 
@@ -661,7 +661,7 @@
   MF.insert(MBBIter, tailMBB);
 
   unsigned FinalStackPtr = Uses64BitFramePtr ? X86::R11 : X86::R11D;
-  BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FinalStackPtr)
+  BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackPtr)
       .addReg(StackPtr)
       .setMIFlag(MachineInstr::FrameSetup);
 
@@ -691,7 +691,7 @@
       .setMIFlag(MachineInstr::FrameSetup);
 
   // cmp with stack pointer bound
-  BuildMI(testMBB, DL, TII.get(IsLP64 ? X86::CMP64rr : X86::CMP32rr))
+  BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
       .addReg(StackPtr)
       .addReg(FinalStackPtr)
       .setMIFlag(MachineInstr::FrameSetup);
@@ -699,23 +699,22 @@
   // jump
   BuildMI(testMBB, DL, TII.get(X86::JCC_1))
       .addMBB(testMBB)
-      .addImm(X86::COND_NE)
+      .addImm(X86::COND_L)
       .setMIFlag(MachineInstr::FrameSetup);
   testMBB->addSuccessor(testMBB);
   testMBB->addSuccessor(tailMBB);
   testMBB->addLiveIn(FinalStackPtr);
 
-  // allocate a block and touch it
-
+  // Move what follows MBBI into tailMBB and rewire the successors.
   tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());
   tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);
   MBB.addSuccessor(testMBB);
 
+  // Snap the stack pointer to its final value to cover the residual amount.
   if (Offset % StackProbeSize) {
-    const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
-    BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
-        .addReg(StackPtr)
-        .addImm(Offset % StackProbeSize)
+    BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(TargetOpcode::COPY),
+            StackPtr)
+        .addReg(FinalStackPtr)
         .setMIFlag(MachineInstr::FrameSetup);
   }
 }
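NOTE (editorial, not part of the patch): the visible effect of the frame-lowering
change above is in the loop tail. Instead of a second subtraction for the
residual allocation below StackProbeSize, the stack pointer is snapped onto the
precomputed bound. Roughly, in C-style pseudocode using the names from the
hunks above (the sketch itself is illustrative):

    /* before */  StackPtr -= Offset % StackProbeSize;   /* SUBri      */
    /* after  */  StackPtr  = FinalStackPtr;             /* plain COPY */

This is also why FinalStackPtr is now seeded via a COPY from StackPtr, the
comparison opcode is keyed on Uses64BitFramePtr rather than IsLP64, and a zero
Offset is rejected up front by the new assert.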
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31542,14 +31542,26 @@
   return SinkMBB;
 }
 
+static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
+  if (IsLP64) {
+    if (isInt<8>(Imm))
+      return X86::SUB64ri8;
+    return X86::SUB64ri32;
+  } else {
+    if (isInt<8>(Imm))
+      return X86::SUB32ri8;
+    return X86::SUB32ri;
+  }
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
-                                           MachineBasicBlock *BB) const {
-  MachineFunction *MF = BB->getParent();
+                                           MachineBasicBlock *MBB) const {
+  MachineFunction *MF = MBB->getParent();
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
   DebugLoc DL = MI.getDebugLoc();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
 
   const unsigned ProbeSize = getStackProbeSize(*MF);
 
@@ -31558,31 +31570,35 @@
   MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
 
-  MachineFunction::iterator MBBIter = ++BB->getIterator();
+  MachineFunction::iterator MBBIter = ++MBB->getIterator();
   MF->insert(MBBIter, testMBB);
   MF->insert(MBBIter, blockMBB);
   MF->insert(MBBIter, tailMBB);
 
-  unsigned sizeVReg = MI.getOperand(1).getReg();
+  Register sizeVReg = MI.getOperand(1).getReg();
 
-  const TargetRegisterClass *SizeRegClass = MRI.getRegClass(sizeVReg);
+  Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
 
-  unsigned tmpSizeVReg = MRI.createVirtualRegister(SizeRegClass);
-  unsigned tmpSizeVReg2 = MRI.createVirtualRegister(SizeRegClass);
+  Register TmpStackPtr = MRI.createVirtualRegister(
+      TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
+  Register FinalStackPtr = MRI.createVirtualRegister(
+      TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
 
-  unsigned physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
+  BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
+      .addReg(physSPReg);
+  {
+    const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
+    BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
+        .addReg(TmpStackPtr)
+        .addReg(sizeVReg);
+  }
 
   // test rsp size
-  BuildMI(testMBB, DL, TII->get(X86::PHI), tmpSizeVReg)
-      .addReg(sizeVReg)
-      .addMBB(BB)
-      .addReg(tmpSizeVReg2)
-      .addMBB(blockMBB);
   BuildMI(testMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::CMP64ri32 : X86::CMP32ri))
-      .addReg(tmpSizeVReg)
-      .addImm(ProbeSize);
+          TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+      .addReg(physSPReg)
+      .addReg(FinalStackPtr);
 
   BuildMI(testMBB, DL, TII->get(X86::JCC_1))
       .addMBB(tailMBB)
@@ -31593,14 +31609,7 @@
 
   // allocate a block and touch it
   BuildMI(blockMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
-          tmpSizeVReg2)
-      .addReg(tmpSizeVReg)
-      .addImm(ProbeSize);
-
-  BuildMI(blockMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
-          physSPReg)
+          TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
       .addReg(physSPReg)
       .addImm(ProbeSize);
 
@@ -31612,19 +31621,14 @@
   BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
   blockMBB->addSuccessor(testMBB);
 
-  // allocate the tail and continue
-  BuildMI(tailMBB, DL,
-          TII->get(TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr),
-          physSPReg)
-      .addReg(physSPReg)
-      .addReg(tmpSizeVReg);
+  // Define the pseudo's result as the precomputed final stack pointer.
   BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY),
           MI.getOperand(0).getReg())
-      .addReg(physSPReg);
+      .addReg(FinalStackPtr);
 
-  tailMBB->splice(tailMBB->end(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  tailMBB->transferSuccessorsAndUpdatePHIs(BB);
-  BB->addSuccessor(testMBB);
+  tailMBB->splice(tailMBB->end(), MBB,
+                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  MBB->addSuccessor(testMBB);
 
   // Delete the original pseudo instruction.
   MI.eraseFromParent();
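NOTE (editorial, not part of the patch): the rework of EmitLoweredProbedAlloca
replaces the PHI-carried "remaining size" counter with a comparison of the
stack pointer against a precomputed final value. Below is a rough C model of
the resulting control flow, matching the jl/jge pattern in the CHECK lines of
stack-clash-dynamic-alloca.ll further down. probed_alloca_model, sp, size and
probe_size are hypothetical names; the real code manipulates the machine stack
pointer and the TmpStackPtr/FinalStackPtr virtual registers, not C variables.

    #include <stdint.h>

    intptr_t probed_alloca_model(intptr_t sp, intptr_t size,
                                 intptr_t probe_size) {
      intptr_t final_sp = sp - size;  /* entry block: COPY of SP, then SUBrr */
      while (sp >= final_sp) {        /* testMBB: CMPrr SP, FinalStackPtr    */
        sp -= probe_size;             /* blockMBB: SUBri by ProbeSize        */
        *(volatile char *)sp = 0;     /* touch the newly exposed page        */
      }
      return final_sp;                /* tailMBB: COPY FinalStackPtr into the
                                         pseudo's result register            */
    }

With the final stack pointer computed once up front, the PHI across
testMBB/blockMBB and the extra SUB in the tail both become unnecessary.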
diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
--- a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
@@ -1,39 +1,7 @@
-; RUN: llc < %s | FileCheck %s
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s
+; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s
 
 define i32 @foo(i32 %n) local_unnamed_addr #0 {
-
-; CHECK-LABEL: foo:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: leaq 15(,%rax,4), %rax
-; CHECK-NEXT: andq $-16, %rax
-; CHECK-NEXT: cmpq $4096, %rax # imm = 0x1000
-; CHECK-NEXT: jl .LBB0_3
-; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: subq $4096, %rax # imm = 0x1000
-; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
-; CHECK-NEXT: movq $0, (%rsp)
-; CHECK-NEXT: cmpq $4096, %rax # imm = 0x1000
-; CHECK-NEXT: jge .LBB0_2
-; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: subq %rax, %rsp
-; CHECK-NEXT: movq %rsp, %rax
-; CHECK-NEXT: movl $1, 4792(%rax)
-; CHECK-NEXT: movl (%rax), %eax
-; CHECK-NEXT: movq %rbp, %rsp
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: .cfi_def_cfa %rsp, 8
-; CHECK-NEXT: retq
-
 %a = alloca i32, i32 %n, align 16
 %b = getelementptr inbounds i32, i32* %a, i64 1198
 store volatile i32 1, i32* %b
@@ -42,3 +10,62 @@
 }
 
 attributes #0 = {"probe-stack"="inline-asm"}
+
+; CHECK-X86-64-LABEL: foo:
+; CHECK-X86-64: # %bb.0:
+; CHECK-X86-64-NEXT: pushq %rbp
+; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 16
+; CHECK-X86-64-NEXT: .cfi_offset %rbp, -16
+; CHECK-X86-64-NEXT: movq %rsp, %rbp
+; CHECK-X86-64-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-X86-64-NEXT: movq %rsp, %rax
+; CHECK-X86-64-NEXT: movl %edi, %ecx
+; CHECK-X86-64-NEXT: leaq 15(,%rcx,4), %rcx
+; CHECK-X86-64-NEXT: andq $-16, %rcx
+; CHECK-X86-64-NEXT: subq %rcx, %rax
+; CHECK-X86-64-NEXT: cmpq %rax, %rsp
+; CHECK-X86-64-NEXT: jl .LBB0_3
+; CHECK-X86-64-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1
+; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000
+; CHECK-X86-64-NEXT: movq $0, (%rsp)
+; CHECK-X86-64-NEXT: cmpq %rax, %rsp
+; CHECK-X86-64-NEXT: jge .LBB0_2
+; CHECK-X86-64-NEXT: .LBB0_3:
+; CHECK-X86-64-NEXT: movq %rax, %rsp
+; CHECK-X86-64-NEXT: movl $1, 4792(%rax)
+; CHECK-X86-64-NEXT: movl (%rax), %eax
+; CHECK-X86-64-NEXT: movq %rbp, %rsp
+; CHECK-X86-64-NEXT: popq %rbp
+; CHECK-X86-64-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-X86-64-NEXT: retq
+
+
+; CHECK-X86-32-LABEL: foo:
+; CHECK-X86-32: # %bb.0:
+; CHECK-X86-32-NEXT: pushl %ebp
+; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 8
+; CHECK-X86-32-NEXT: .cfi_offset %ebp, -8
+; CHECK-X86-32-NEXT: movl %esp, %ebp
+; CHECK-X86-32-NEXT: .cfi_def_cfa_register %ebp
+; CHECK-X86-32-NEXT: subl $8, %esp
+; CHECK-X86-32-NEXT: movl 8(%ebp), %ecx
+; CHECK-X86-32-NEXT: movl %esp, %eax
+; CHECK-X86-32-NEXT: leal 15(,%ecx,4), %ecx
+; CHECK-X86-32-NEXT: andl $-16, %ecx
+; CHECK-X86-32-NEXT: subl %ecx, %eax
+; CHECK-X86-32-NEXT: cmpl %eax, %esp
+; CHECK-X86-32-NEXT: jl .LBB0_3
+; CHECK-X86-32-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1
+; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000
+; CHECK-X86-32-NEXT: movl $0, (%esp)
+; CHECK-X86-32-NEXT: cmpl %eax, %esp
+; CHECK-X86-32-NEXT: jge .LBB0_2
+; CHECK-X86-32-NEXT: .LBB0_3:
+; CHECK-X86-32-NEXT: movl %eax, %esp
+; CHECK-X86-32-NEXT: movl $1, 4792(%eax)
+; CHECK-X86-32-NEXT: movl (%eax), %eax
+; CHECK-X86-32-NEXT: movl %ebp, %esp
+; CHECK-X86-32-NEXT: popl %ebp
+; CHECK-X86-32-NEXT: .cfi_def_cfa %esp, 4
+; CHECK-X86-32-NEXT: retl
+
diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll
--- a/llvm/test/CodeGen/X86/stack-clash-large.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-large.ll
@@ -1,31 +1,8 @@
-; RUN: llc < %s | FileCheck %s
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s
+; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s
 
 define i32 @foo() local_unnamed_addr #0 {
-; CHECK-LABEL: foo:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rsp, %r11
-; CHECK-NEXT: subq $69632, %r11 # imm = 0x11000
-; CHECK-NEXT: .LBB0_1:
-; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
-; CHECK-NEXT: movq $0, (%rsp)
-; CHECK-NEXT: cmpq %r11, %rsp
-; CHECK-NEXT: jne .LBB0_1
-; CHECK-NEXT:# %bb.2:
-; CHECK-NEXT: subq $2248, %rsp # imm = 0x8C8
-; CHECK-NEXT: .cfi_def_cfa_offset 71888
-; CHECK-NEXT: movl $1, 264(%rsp)
-; CHECK-NEXT: movl $1, 28664(%rsp)
-; CHECK-NEXT: movl -128(%rsp), %eax
-; CHECK-NEXT: addq $71880, %rsp # imm = 0x118C8
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
-
-
 %a = alloca i32, i64 18000, align 16
 %b0 = getelementptr inbounds i32, i32* %a, i64 98
 %b1 = getelementptr inbounds i32, i32* %a, i64 7198
@@ -36,3 +13,41 @@
 }
 
 attributes #0 = {"probe-stack"="inline-asm"}
+
+; CHECK-X86-64-LABEL: foo:
+; CHECK-X86-64: # %bb.0:
+; CHECK-X86-64-NEXT: movq %rsp, %r11
+; CHECK-X86-64-NEXT: subq $69632, %r11 # imm = 0x11000
+; CHECK-X86-64-NEXT: .LBB0_1:
+; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000
+; CHECK-X86-64-NEXT: movq $0, (%rsp)
+; CHECK-X86-64-NEXT: cmpq %r11, %rsp
+; CHECK-X86-64-NEXT: jl .LBB0_1
+; CHECK-X86-64-NEXT:# %bb.2:
+; CHECK-X86-64-NEXT: movq %r11, %rsp
+; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 71888
+; CHECK-X86-64-NEXT: movl $1, 264(%rsp)
+; CHECK-X86-64-NEXT: movl $1, 28664(%rsp)
+; CHECK-X86-64-NEXT: movl -128(%rsp), %eax
+; CHECK-X86-64-NEXT: addq $71880, %rsp # imm = 0x118C8
+; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 8
+; CHECK-X86-64-NEXT: retq
+
+; CHECK-X86-32-LABEL: foo:
+; CHECK-X86-32: # %bb.0:
+; CHECK-X86-32-NEXT: movl %esp, %r11d
+; CHECK-X86-32-NEXT: subl $69632, %r11d # imm = 0x11000
+; CHECK-X86-32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000
+; CHECK-X86-32-NEXT: movl $0, (%esp)
+; CHECK-X86-32-NEXT: cmpl %r11d, %esp
+; CHECK-X86-32-NEXT: jl .LBB0_1
+; CHECK-X86-32-NEXT:# %bb.2:
+; CHECK-X86-32-NEXT: movl %r11d, %esp
+; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 72016
+; CHECK-X86-32-NEXT: movl $1, 392(%esp)
+; CHECK-X86-32-NEXT: movl $1, 28792(%esp)
+; CHECK-X86-32-NEXT: movl (%esp), %eax
+; CHECK-X86-32-NEXT: addl $72012, %esp # imm = 0x1194C
+; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 4
+; CHECK-X86-32-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/stack-clash-medium.ll b/llvm/test/CodeGen/X86/stack-clash-medium.ll
--- a/llvm/test/CodeGen/X86/stack-clash-medium.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-medium.ll
@@ -1,25 +1,7 @@
-; RUN: llc < %s | FileCheck %s
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s
+; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s
 
 define i32 @foo() local_unnamed_addr #0 {
-
-; CHECK-LABEL: foo:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
-; CHECK-NEXT: movq $0, (%rsp)
-; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8
-; CHECK-NEXT: .cfi_def_cfa_offset 7888
-; CHECK-NEXT: movl $1, 672(%rsp)
-; CHECK-NEXT: movl -128(%rsp), %eax
-; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8
-; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: retq
-
-
-
 %a = alloca i32, i64 2000, align 16
 %b = getelementptr inbounds i32, i32* %a, i64 200
 store volatile i32 1, i32* %b
@@ -28,3 +10,28 @@
 }
 
 attributes #0 = {"probe-stack"="inline-asm"}
+
+; CHECK-X86-64-LABEL: foo:
+; CHECK-X86-64: # %bb.0:
+; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000
+; CHECK-X86-64-NEXT: movq $0, (%rsp)
+; CHECK-X86-64-NEXT: subq $3784, %rsp # imm = 0xEC8
+; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 7888
+; CHECK-X86-64-NEXT: movl $1, 672(%rsp)
+; CHECK-X86-64-NEXT: movl -128(%rsp), %eax
+; CHECK-X86-64-NEXT: addq $7880, %rsp # imm = 0x1EC8
+; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 8
+; CHECK-X86-64-NEXT: retq
+
+
+; CHECK-X86-32-LABEL: foo:
+; CHECK-X86-32: # %bb.0:
+; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000
+; CHECK-X86-32-NEXT: movl $0, (%esp)
+; CHECK-X86-32-NEXT: subl $3916, %esp # imm = 0xF4C
+; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 8016
+; CHECK-X86-32-NEXT: movl $1, 800(%esp)
+; CHECK-X86-32-NEXT: movl (%esp), %eax
+; CHECK-X86-32-NEXT: addl $8012, %esp # imm = 0x1F4C
+; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 4
+; CHECK-X86-32-NEXT: retl