Index: lib/Target/X86/X86InstrFPStack.td =================================================================== --- lib/Target/X86/X86InstrFPStack.td +++ lib/Target/X86/X86InstrFPStack.td @@ -229,7 +229,7 @@ } // mayLoad = 1, hasSideEffects = 1 } -let Defs = [FPSW] in { +let Defs = [FPSW], Uses = [FPCW] in { // FPBinary_rr just defines pseudo-instructions, no need to set a scheduling // resources. let hasNoSchedulingInfo = 1 in { @@ -266,7 +266,7 @@ // NOTE: GAS and apparently all other AT&T style assemblers have a broken notion // of some of the 'reverse' forms of the fsub and fdiv instructions. As such, // we have to put some 'r's in and take them out of weird places. -let SchedRW = [WriteFAdd], Defs = [FPSW] in { +let SchedRW = [WriteFAdd], Defs = [FPSW], Uses = [FPCW] in { def ADD_FST0r : FPST0rInst ; def ADD_FrST0 : FPrST0Inst ; def ADD_FPrST0 : FPrST0PInst; @@ -281,12 +281,12 @@ def COM_FST0r : FPST0rInst ; def COMP_FST0r : FPST0rInst ; } // SchedRW -let SchedRW = [WriteFMul], Defs = [FPSW] in { +let SchedRW = [WriteFMul], Defs = [FPSW], Uses = [FPCW] in { def MUL_FST0r : FPST0rInst ; def MUL_FrST0 : FPrST0Inst ; def MUL_FPrST0 : FPrST0PInst; } // SchedRW -let SchedRW = [WriteFDiv], Defs = [FPSW] in { +let SchedRW = [WriteFDiv], Defs = [FPSW], Uses = [FPCW] in { def DIVR_FST0r : FPST0rInst ; def DIV_FrST0 : FPrST0Inst ; def DIV_FPrST0 : FPrST0PInst; @@ -453,7 +453,7 @@ [(set RFP80:$dst, (X86fild addr:$src, i64))]>; } // SchedRW -let SchedRW = [WriteStore] in { +let SchedRW = [WriteStore], Uses = [FPCW] in { def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, [(store RFP32:$src, addr:$op)]>; def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, @@ -488,7 +488,7 @@ def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; } // mayStore -} // SchedRW +} // SchedRW, Uses = [FPCW] let mayLoad = 1, SchedRW = [WriteLoad] in { def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">; @@ -498,7 +498,7 @@ def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">; def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">; } -let mayStore = 1, SchedRW = [WriteStore] in { +let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in { def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">; def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">; def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">; @@ -512,7 +512,7 @@ } // FISTTP requires SSE3 even though it's a FPStack op. -let Predicates = [HasSSE3], SchedRW = [WriteStore] in { +let Predicates = [HasSSE3], SchedRW = [WriteStore], Uses = [FPCW] in { def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, [(X86fp_to_i16mem RFP32:$src, addr:$op)]>; def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, @@ -533,7 +533,7 @@ [(X86fp_to_i64mem RFP80:$src, addr:$op)]>; } // Predicates = [HasSSE3] -let mayStore = 1, SchedRW = [WriteStore] in { +let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in { def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">; def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">; def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst">; @@ -632,12 +632,12 @@ def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags (outs), (ins), "fnstsw\t{%ax|ax}", [(set AX, (X86fp_stsw FPSW))]>; -let Defs = [FPSW] in +let Defs = [FPSW], Uses = [FPCW] in def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world (outs), (ins i16mem:$dst), "fnstcw\t$dst", [(X86fp_cwd_get16 addr:$dst)]>; } // SchedRW -let Defs = [FPSW], mayLoad = 1 in +let Defs = [FPSW,FPCW], mayLoad = 1 in def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] (outs), (ins i16mem:$dst), "fldcw\t$dst", []>, Sched<[WriteLoad]>; Index: lib/Target/X86/X86RegisterInfo.cpp =================================================================== --- lib/Target/X86/X86RegisterInfo.cpp +++ lib/Target/X86/X86RegisterInfo.cpp @@ -496,6 +496,9 @@ BitVector Reserved(getNumRegs()); const X86FrameLowering *TFI = getFrameLowering(MF); + // Set the floating point control register as reserved. + Reserved.set(X86::FPCW); + // Set the stack-pointer register and its aliases as reserved. for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid(); ++I) Index: lib/Target/X86/X86RegisterInfo.td =================================================================== --- lib/Target/X86/X86RegisterInfo.td +++ lib/Target/X86/X86RegisterInfo.td @@ -289,6 +289,9 @@ // Floating-point status word def FPSW : X86Reg<"fpsw", 0>; +// Floating-point control word +def FPCW : X86Reg<"fpcr", 0>; + // Status flags register. // // Note that some flags that are commonly thought of as part of the status Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp =================================================================== --- lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3967,9 +3967,27 @@ if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) { Formula F = Base; - if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy); - for (const SCEV *&BaseReg : F.BaseRegs) - BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy); + // Sometimes SCEV is able to prove zero during ext transform. It may + // happen if SCEV did not do all possible transforms while creating the + // initial node (maybe due to depth limitations), but it can do them while + // taking ext. + if (F.ScaledReg) { + const SCEV *NewScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy); + if (NewScaledReg->isZero()) + continue; + F.ScaledReg = NewScaledReg; + } + bool HasZeroBaseReg = false; + for (const SCEV *&BaseReg : F.BaseRegs) { + const SCEV *NewBaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy); + if (NewBaseReg->isZero()) { + HasZeroBaseReg = true; + break; + } + BaseReg = NewBaseReg; + } + if (HasZeroBaseReg) + continue; // TODO: This assumes we've done basic processing on all uses and // have an idea what the register usage is. Index: test/CodeGen/MIR/X86/memory-operands.mir =================================================================== --- test/CodeGen/MIR/X86/memory-operands.mir +++ test/CodeGen/MIR/X86/memory-operands.mir @@ -359,8 +359,8 @@ CFI_INSTRUCTION def_cfa_offset 32 LD_F80m $rsp, 1, $noreg, 32, $noreg, implicit-def dead $fpsw ; CHECK: name: stack_psv - ; CHECK: ST_FP80m $rsp, 1, $noreg, 0, $noreg, implicit-def dead $fpsw :: (store 10 into stack, align 16) - ST_FP80m $rsp, 1, _, 0, _, implicit-def dead $fpsw :: (store 10 into stack, align 16) + ; CHECK: ST_FP80m $rsp, 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (store 10 into stack, align 16) + ST_FP80m $rsp, 1, _, 0, _, implicit-def dead $fpsw, implicit $fpcw :: (store 10 into stack, align 16) CALL64pcrel32 &cosl, csr_64, implicit $rsp, implicit-def $rsp, implicit-def $fp0 $rsp = ADD64ri8 $rsp, 24, implicit-def dead $eflags RETQ Index: test/CodeGen/X86/ipra-reg-usage.ll =================================================================== --- test/CodeGen/X86/ipra-reg-usage.ll +++ test/CodeGen/X86/ipra-reg-usage.ll @@ -3,7 +3,7 @@ target triple = "x86_64-unknown-unknown" declare void @bar1() define preserve_allcc void @foo()#0 { -; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $fpsw $fs $gs $hip $ip $rip $riz $ss $ssp $bnd0 $bnd1 $bnd2 $bnd3 $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $r11b $r11bh $r11d $r11w $r11wh +; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $fpcw $fpsw $fs $gs $hip $ip $rip $riz $ss $ssp $bnd0 $bnd1 $bnd2 $bnd3 $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $r11b $r11bh $r11d $r11w $r11wh call void @bar1() call void @bar2() ret void Index: test/CodeGen/X86/pr34080.ll =================================================================== --- test/CodeGen/X86/pr34080.ll +++ test/CodeGen/X86/pr34080.ll @@ -65,12 +65,12 @@ ; SSE2-SCHEDULE-NEXT: movsd %xmm0, -64(%rbp) ; SSE2-SCHEDULE-NEXT: movsd %xmm0, -32(%rbp) ; SSE2-SCHEDULE-NEXT: fsubl -32(%rbp) -; SSE2-SCHEDULE-NEXT: fnstcw -2(%rbp) ; SSE2-SCHEDULE-NEXT: flds {{.*}}(%rip) +; SSE2-SCHEDULE-NEXT: fnstcw -2(%rbp) +; SSE2-SCHEDULE-NEXT: fmul %st, %st(1) ; SSE2-SCHEDULE-NEXT: movzwl -2(%rbp), %eax ; SSE2-SCHEDULE-NEXT: movw $3199, -2(%rbp) ## imm = 0xC7F ; SSE2-SCHEDULE-NEXT: fldcw -2(%rbp) -; SSE2-SCHEDULE-NEXT: fmul %st, %st(1) ; SSE2-SCHEDULE-NEXT: movw %ax, -2(%rbp) ; SSE2-SCHEDULE-NEXT: fxch %st(1) ; SSE2-SCHEDULE-NEXT: fistl -12(%rbp) Index: test/CodeGen/X86/pr40529.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/pr40529.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s + +define x86_fp80 @rem_pio2l_min(x86_fp80 %z) { +; CHECK-LABEL: rem_pio2l_min: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F +; CHECK-NEXT: fldcw -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fistl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldcw -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fisubl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: flds {{.*}}(%rip) +; CHECK-NEXT: fmul %st, %st(1) +; CHECK-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F +; CHECK-NEXT: fldcw -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: fistl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fldcw -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fisubl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: fmulp %st, %st(1) +; CHECK-NEXT: retq +entry: + %conv = fptosi x86_fp80 %z to i32 + %conv1 = sitofp i32 %conv to x86_fp80 + %sub = fsub x86_fp80 %z, %conv1 + %mul = fmul x86_fp80 %sub, 0xK40178000000000000000 + %conv2 = fptosi x86_fp80 %mul to i32 + %conv3 = sitofp i32 %conv2 to x86_fp80 + %sub4 = fsub x86_fp80 %mul, %conv3 + %mul5 = fmul x86_fp80 %sub4, 0xK40178000000000000000 + ret x86_fp80 %mul5 +} Index: test/Transforms/LoopStrengthReduce/X86/pr40514.ll =================================================================== --- /dev/null +++ test/Transforms/LoopStrengthReduce/X86/pr40514.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-reduce -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @pluto(i32 %arg) #0 { +; CHECK-LABEL: @pluto( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB10:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i64 [[LSR_IV_NEXT2:%.*]], i64 addrspace(1)* undef, align 8 +; CHECK-NEXT: ret i32 [[LSR_IV_NEXT:%.*]] +; CHECK: bb10: +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2]], [[BB10]] ], [ 9, [[BB:%.*]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT]], [[BB10]] ], [ undef, [[BB]] ] +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 1 +; CHECK-NEXT: [[LSR_IV_NEXT2]] = add nuw nsw i64 [[LSR_IV1]], 1 +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB10]] +; + +bb: + br label %bb10 + +bb1: ; preds = %bb10 + %tmp = and i64 %tmp24, 4294967295 + %tmp2 = shl i64 %tmp23, 33 + %tmp3 = ashr exact i64 %tmp2, 32 + %tmp4 = add i64 undef, %tmp + %tmp5 = add i64 %tmp4, %tmp3 + %tmp6 = add i64 %tmp5, undef + %tmp7 = add i64 %tmp6, undef + %tmp8 = add i64 undef, %tmp7 + store i64 %tmp8, i64 addrspace(1)* undef, align 8 + %tmp9 = trunc i64 %tmp7 to i32 + ret i32 %tmp9 + +bb10: ; preds = %bb10, %bb + %tmp11 = phi i64 [ 9, %bb ], [ %tmp24, %bb10 ] + %tmp12 = shl i64 undef, 1 + %tmp13 = mul i64 %tmp12, %tmp12 + %tmp14 = shl i64 %tmp13, 1 + %tmp15 = mul i64 %tmp14, %tmp14 + %tmp16 = shl i64 %tmp15, 1 + %tmp17 = mul i64 %tmp16, %tmp16 + %tmp18 = shl i64 %tmp17, 1 + %tmp19 = mul i64 %tmp18, %tmp18 + %tmp20 = shl i64 %tmp19, 1 + %tmp21 = mul i64 %tmp20, %tmp20 + %tmp22 = shl i64 %tmp21, 1 + %tmp23 = mul i64 %tmp22, %tmp22 + %tmp24 = add nuw nsw i64 %tmp11, 1 + br i1 undef, label %bb1, label %bb10 +} + + +attributes #0 = { "target-cpu"="broadwell" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,-xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }