diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -982,8 +982,24 @@
   MachineInstr &MI = *I;
   unsigned STReturns = 0;
 
+  bool ClobbersFPStack = false;
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     MachineOperand &Op = MI.getOperand(i);
+    // Check if this call clobbers the FP stack. Checking FP0 in the regmask
+    // is sufficient; the assertion below verifies that all FP registers agree.
+    if (Op.isRegMask()) {
+      bool ClobbersFP0 = Op.clobbersPhysReg(X86::FP0);
+#ifndef NDEBUG
+      static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
+      for (unsigned i = 1; i != 8; ++i)
+        assert(Op.clobbersPhysReg(X86::FP0 + i) == ClobbersFP0 &&
+               "Inconsistent FP register clobber");
+#endif
+
+      if (ClobbersFP0)
+        ClobbersFPStack = true;
+    }
+
     if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
       continue;
 
@@ -998,6 +1014,14 @@
     --e;
   }
 
+  // Most calls should have a regmask that clobbers the FP registers. If it
+  // isn't present then the register allocator didn't spill the FP registers,
+  // so they are still on the stack.
+  assert((ClobbersFPStack || STReturns == 0) &&
+         "ST returns without FP stack clobber");
+  if (!ClobbersFPStack)
+    return;
+
   unsigned N = countTrailingOnes(STReturns);
 
   // FP registers used for function return must be consecutive starting at
diff --git a/llvm/test/CodeGen/X86/pr50782.ll b/llvm/test/CodeGen/X86/pr50782.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr50782.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-w64-windows-gnu | FileCheck %s
+
+@a = global i32 0, align 4
+@b = global float 0.000000e+00, align 4
+@d = global float 0.000000e+00, align 4
+@f = global i32 0, align 4
+@g = global float 0.000000e+00, align 4
+@e = global i32 0, align 4
+@c = global float* null, align 4
+
+; The FP stack should be preserved across the call to __alloca.
+define void @h(float %i) {
+; CHECK-LABEL: h:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .cfi_offset %ebp, -8
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: .cfi_def_cfa_register %ebp
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: andl $-16, %esp
+; CHECK-NEXT: subl $32, %esp
+; CHECK-NEXT: movl %esp, %esi
+; CHECK-NEXT: .cfi_offset %esi, -12
+; CHECK-NEXT: flds 8(%ebp)
+; CHECK-NEXT: movl _a, %ecx
+; CHECK-NEXT: leal 3(%ecx), %eax
+; CHECK-NEXT: andl $-4, %eax
+; CHECK-NEXT: calll __alloca
+; CHECK-NEXT: movl %esp, %eax
+; CHECK-NEXT: andl $-16, %eax
+; CHECK-NEXT: movl %eax, %esp
+; CHECK-NEXT: fsts 8(%esi) # 4-byte Folded Spill
+; CHECK-NEXT: fadds _b
+; CHECK-NEXT: fsts _d
+; CHECK-NEXT: fld1
+; CHECK-NEXT: fldz
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: fld %st(0)
+; CHECK-NEXT: fld %st(2)
+; CHECK-NEXT: je LBB0_2
+; CHECK-NEXT: # %bb.1: # %for.body.preheader
+; CHECK-NEXT: fstp %st(1)
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: movl _f, %ecx
+; CHECK-NEXT: flds (%eax,%ecx,4)
+; CHECK-NEXT: fld %st(3)
+; CHECK-NEXT: LBB0_2: # %for.cond1.preheader
+; CHECK-NEXT: movl _e, %ecx
+; CHECK-NEXT: movl %ecx, 12(%esi)
+; CHECK-NEXT: fildl 12(%esi)
+; CHECK-NEXT: movl _c, %edx
+; CHECK-NEXT: jmp LBB0_3
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: LBB0_5: # %for.inc
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: fxch %st(5)
+; CHECK-NEXT: fadd %st(4), %st
+; CHECK-NEXT: fxch %st(5)
+; CHECK-NEXT: LBB0_3: # %for.cond1
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: fld %st(5)
+; CHECK-NEXT: fmul %st(4), %st
+; CHECK-NEXT: fdiv %st(2), %st
+; CHECK-NEXT: fadd %st(3), %st
+; CHECK-NEXT: fsts _g
+; CHECK-NEXT: fxch %st(1)
+; CHECK-NEXT: fucom %st(1)
+; CHECK-NEXT: fstp %st(1)
+; CHECK-NEXT: fnstsw %ax
+; CHECK-NEXT: # kill: def $ah killed $ah killed $ax
+; CHECK-NEXT: sahf
+; CHECK-NEXT: jbe LBB0_5
+; CHECK-NEXT: # %bb.4: # %if.then
+; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: flds 8(%esi) # 4-byte Folded Reload
+; CHECK-NEXT: fstps (%edx,%ecx,4)
+; CHECK-NEXT: jmp LBB0_5
+entry:
+  %0 = load i32, i32* @a, align 4
+  %1 = alloca i8, i32 %0, align 16
+  %2 = load float, float* @b, align 4
+  %add = fadd float %2, %i
+  store float %add, float* @d, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.cond1.preheader, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %3 = bitcast i8* %1 to float*
+  %4 = load i32, i32* @f, align 4
+  %arrayidx.le = getelementptr inbounds float, float* %3, i32 %4
+  %5 = load float, float* %arrayidx.le, align 4
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.body.preheader, %entry
+  %k.0.lcssa = phi float [ %5, %for.body.preheader ], [ undef, %entry ]
+  %l.0.lcssa = phi float [ %add, %for.body.preheader ], [ 1.000000e+00, %entry ]
+  %6 = load i32, i32* @e, align 4
+  %conv = sitofp i32 %6 to float
+  %7 = load float*, float** @c, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %7, i32 %6
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.cond1.preheader
+  %m.0 = phi float [ %add5, %for.inc ], [ %add, %for.cond1.preheader ]
+  %mul = fmul float %m.0, 0.000000e+00
+  %div = fdiv float %mul, %l.0.lcssa
+  %add2 = fadd float %k.0.lcssa, %div
+  store float %add2, float* @g, align 4
+  %cmp = fcmp olt float %add2, %conv
+  br i1 %cmp, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.cond1
+  store float %i, float* %arrayidx4, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.cond1
+  %add5 = fadd float %m.0, 1.000000e+00
+  br label %for.cond1
+}
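
For context, the IR above roughly corresponds to a C reduction along these lines. This is a hand-reconstructed sketch from the IR, not the original PR50782 source; the local names (j, k, l, m) and the use of __builtin_alloca are assumptions. The key property is that the x87 value computed from b + i stays live across the dynamically sized alloca, which this target lowers to the calll __alloca seen in the CHECK lines:

/* Hypothetical reduced C, reconstructed from the IR above; names and exact
   control flow are assumptions, not the original PR50782 reproducer. */
int a, e, f;
float b, d, g, *c;

void h(float i) {
  /* Dynamically sized alloca; on i686-w64-windows-gnu this becomes a call to
     __alloca, which must not be treated as clobbering the x87 stack. */
  float *j = __builtin_alloca(a);
  float k = 0.0f, l = 1.0f, m;
  d = b + i;            /* fadds _b / fsts _d in the checks */
  if (a) {
    k = j[f];
    l = d;
  }
  for (m = d;; m += 1.0f) {
    g = k + m * 0.0f / l;
    if (g < (float)e)
      c[e] = i;         /* reloads the spilled value of i */
  }
}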