Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3953,6 +3953,16 @@ VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); } + // If the widened op is not legal and the scalar variants + // will be turned into libcalls, unroll the op to prevent + // widening so that we don't end up with libcalls on + // undef elements. + if (!TLI.isOperationLegal(Opcode, VT) && + TLI.getOperationAction(Opcode, VT.getScalarType()) == + TargetLowering::LibCall) { + return DAG.UnrollVectorOp(N, VT.getVectorNumElements()); + } + if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) { // Operation doesn't trap so just widen as normal. SDValue InOp1 = GetWidenedVector(N->getOperand(0)); Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -354,8 +354,8 @@ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); - setOperationAction(ISD::FREM , MVT::f32 , Expand); - setOperationAction(ISD::FREM , MVT::f64 , Expand); + setOperationAction(ISD::FREM , MVT::f32 , LibCall); + setOperationAction(ISD::FREM , MVT::f64 , LibCall); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FREM , MVT::f128 , Expand); @@ -2195,19 +2195,24 @@ // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. if (Subtarget.is32Bit() && - (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) - for (ISD::NodeType Op : + (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) { + for (ISD::NodeType Op : {ISD::FCEIL, ISD::STRICT_FCEIL, ISD::FCOS, ISD::STRICT_FCOS, ISD::FEXP, ISD::STRICT_FEXP, ISD::FFLOOR, ISD::STRICT_FFLOOR, - ISD::FREM, ISD::STRICT_FREM, ISD::FLOG, ISD::STRICT_FLOG, ISD::FLOG10, ISD::STRICT_FLOG10, ISD::FPOW, ISD::STRICT_FPOW, ISD::FSIN, ISD::STRICT_FSIN}) - if (isOperationExpand(Op, MVT::f32)) - setOperationAction(Op, MVT::f32, Promote); + if (isOperationExpand(Op, MVT::f32)) + setOperationAction(Op, MVT::f32, Promote); + + // The default action for f32 FREMs on X86 is LibCall + // so these need to be explicitly changed to Promote. + setOperationAction(ISD::FREM, MVT::f32, Promote); + setOperationAction(ISD::STRICT_FREM, MVT::f32, Promote); + } // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::VECTOR_SHUFFLE, Index: llvm/test/Analysis/CostModel/X86/arith-fp.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/arith-fp.ll +++ llvm/test/Analysis/CostModel/X86/arith-fp.ll @@ -628,80 +628,80 @@ define i32 @frem(i32 %arg) { ; SSE1-LABEL: 'frem' -; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = frem <4 x double> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = frem <8 x double> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F32 = frem <8 x float> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F32 = frem <16 x float> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = frem <2 x double> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = frem <4 x double> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8F64 = frem <8 x double> undef, undef ; SSE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE2-LABEL: 'frem' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F32 = frem <8 x float> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F32 = frem <16 x float> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F64 = frem <8 x double> undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'frem' -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F32 = frem <8 x float> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F32 = frem <16 x float> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F64 = frem <8 x double> undef, undef ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'frem' -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = frem <8 x double> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8F32 = frem <8 x float> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16F32 = frem <16 x float> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4F64 = frem <4 x double> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8F64 = frem <8 x double> undef, undef ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'frem' -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16F32 = frem <16 x float> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = frem <8 x double> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8F32 = frem <8 x float> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V16F32 = frem <16 x float> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4F64 = frem <4 x double> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8F64 = frem <8 x double> undef, undef ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'frem' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F32 = frem <8 x float> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F32 = frem <16 x float> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F64 = frem <8 x double> undef, undef ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'frem' -; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F32 = frem <8 x float> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F32 = frem <16 x float> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F64 = frem <8 x double> undef, undef ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F32 = frem float undef, undef Index: llvm/test/CodeGen/X86/frem-libcall.ll =================================================================== --- llvm/test/CodeGen/X86/frem-libcall.ll +++ llvm/test/CodeGen/X86/frem-libcall.ll @@ -8,42 +8,26 @@ ; CHECK-LABEL: frem: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $80, %rsp +; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: divps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: movlps %xmm1, (%rbx) -; CHECK-NEXT: addq $80, %rsp +; CHECK-NEXT: addq $64, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq %frem = frem <2 x float> %a0, %a1