diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -4488,6 +4488,12 @@ return nullptr; } + /// Returns a 0-terminated array of rounding control registers that can be + /// attached to a strict FP call. The default implementation returns nullptr. + virtual const MCPhysReg *getRoundingControlRegisters() const { + return nullptr; + } + /// This callback is used to prepare for a volatile or atomic load. /// It takes a chain node as input and returns the chain for the load itself. /// diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -1161,6 +1161,14 @@ } } + // Add rounding control registers as implicit defs of calls in strictfp code. + if (II.isCall() && MF->getFunction().hasFnAttribute(Attribute::StrictFP)) { + const MCPhysReg *RCRegs = TLI->getRoundingControlRegisters(); + if (RCRegs) + for (; *RCRegs; ++RCRegs) + UsedRegs.push_back(*RCRegs); + } + // Finally mark unused registers as dead. 
if (!UsedRegs.empty() || !II.implicit_defs().empty() || II.hasOptionalDef()) MIB->setPhysRegsDeadExcept(UsedRegs, *TRI); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1702,6 +1702,7 @@ LLVMContext &Context) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + const MCPhysReg *getRoundingControlRegisters() const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3111,6 +3111,13 @@ return ScratchRegs; } +const MCPhysReg *X86TargetLowering::getRoundingControlRegisters() const { + // FIXME: X86::FPCW should be defined for x87 as well, but doing so currently + // changes a lot of lit tests in unexpected ways. 
+ static const MCPhysReg RCRegs[] = { X86::MXCSR, 0 }; + return RCRegs; +} + /// Lowers masks values (v*i1) to the local register values /// \returns DAG node after lowering to register type static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll @@ -380,10 +380,12 @@ ; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: testq %rdi, %rdi ; SSE2-NEXT: cmovnsq %rdi, %rcx -; SSE2-NEXT: cvtsi2ss %rcx, %xmm0 -; SSE2-NEXT: jns .LBB9_2 +; SSE2-NEXT: cvtsi2ss %rcx, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addss %xmm1, %xmm0 +; SSE2-NEXT: js .LBB9_2 ; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: .LBB9_2: ; SSE2-NEXT: pushq %rax ; SSE2-NEXT: callq __truncsfhf2@PLT diff --git a/llvm/test/CodeGen/X86/pr59305.ll b/llvm/test/CodeGen/X86/pr59305.ll --- a/llvm/test/CodeGen/X86/pr59305.ll +++ b/llvm/test/CodeGen/X86/pr59305.ll @@ -4,23 +4,28 @@ define double @foo(double %0) #0 { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rax +; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; CHECK-NEXT: movl $1024, %edi # imm = 0x400 ; CHECK-NEXT: callq fesetround@PLT -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: divsd (%rsp), %xmm0 # 8-byte Folded Reload -; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: divsd (%rsp), %xmm1 # 8-byte Folded Reload +; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movl $1024, %edi # imm = 0x400 ; CHECK-NEXT: callq fesetround@PLT +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: divsd (%rsp), %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte 
Spill ; CHECK-NEXT: movl $1024, %edi # imm = 0x400 ; CHECK-NEXT: callq fesetround@PLT -; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: divsd (%rsp), %xmm2 # 8-byte Folded Reload +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: callq fma@PLT -; CHECK-NEXT: popq %rax +; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: retq %2 = call i32 @fesetround(i32 noundef 1024) %3 = call double @llvm.experimental.constrained.fdiv.f64(double 1.000000e+00, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0