Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16265,15 +16265,21 @@
   const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
   const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1;

-  // Skip this folding if it results in an fp_round from f80 to f16.
+  // Skip this folding if it results in an fp_round from f80 or f64 to f16.
   //
-  // f80 to f16 always generates an expensive (and as yet, unimplemented)
-  // libcall to __truncxfhf2 instead of selecting native f16 conversion
-  // instructions from f32 or f64. Moreover, the first (value-preserving)
-  // fp_round from f80 to either f32 or f64 may become a NOP in platforms like
-  // x86.
-  if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
-    return SDValue();
+  // f80 (or f64) to f16 always generates an expensive (and as yet,
+  // unimplemented on most targets) libcall to __truncxfhf2 (or to
+  // __truncdfhf2) instead of selecting native f16 conversion instructions
+  // from f32. Suppress this folding in both the scalar and vector forms.
+  bool TargIsFp16 = VT == MVT::f16 ||
+                    (VT.isVector() && VT.getVectorElementType() == MVT::f16);
+  if (TargIsFp16) {
+    EVT SrcVT = N0.getOperand(0).getValueType();
+    if (SrcVT.isVector())
+      SrcVT = SrcVT.getVectorElementType();
+    if (SrcVT == MVT::f64 || SrcVT == MVT::f80)
+      return SDValue();
+  }

   // If the first fp_round isn't a value preserving truncation, it might
   // introduce a tie in the second fp_round, that wouldn't occur in the
Index: llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
===================================================================
--- llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
+++ llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=ALL
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=ALL
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=ALL --check-prefix=F16C
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX

 define zeroext i16 @test1_fast(double %d) #0 {
 ; ALL-LABEL: test1_fast:
@@ -62,6 +62,233 @@
   ret i16 %0
 }

+; The 'test3*' versions convert:
+;   'double' -> 'float' -> 'half'
+; With 'unsafe-fp-math', it is legal to fold this to converting directly
+; from 'double' -> 'half'. But that is generally less efficient (done
+; via slow run-time calls) than the two-step unfolded approach. So
+; verify that we generate the same code (without the folding) for
+; the plain and fast versions.
+define half @test3_fast(double %d) #0 {
+; F16C-LABEL: test3_fast:
+; F16C:       # %bb.0:
+; F16C-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vmovd %xmm0, %eax
+; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT:    retq
+;
+; AVX-LABEL: test3_fast:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    retq
+  %1 = fptrunc double %d to float
+  %2 = fptrunc float %1 to half
+  ret half %2
+}
+
+define half @test3(double %d) #1 {
+; F16C-LABEL: test3:
+; F16C:       # %bb.0:
+; F16C-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vmovd %xmm0, %eax
+; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT:    retq
+;
+; AVX-LABEL: test3:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    retq
+  %1 = fptrunc double %d to float
+  %2 = fptrunc float %1 to half
+  ret half %2
+}
+
+define <2 x half> @test3_vector_fast(<2 x double> %d) #0 {
+; F16C-LABEL: test3_vector_fast:
+; F16C:       # %bb.0:
+; F16C-NEXT:    vcvtpd2ps %xmm0, %xmm0
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    retq
+;
+; AVX-LABEL: test3_vector_fast:
+; AVX:       # %bb.0:
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    vcvtpd2ps %xmm0, %xmm0
+; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    retq
+  %1 = fptrunc <2 x double> %d to <2 x float>
+  %2 = fptrunc <2 x float> %1 to <2 x half>
+  ret <2 x half> %2
+}
+
+define <2 x half> @test3_vector(<2 x double> %d) #1 {
+; F16C-LABEL: test3_vector:
+; F16C:       # %bb.0:
+; F16C-NEXT:    vcvtpd2ps %xmm0, %xmm0
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    retq
+;
+; AVX-LABEL: test3_vector:
+; AVX:       # %bb.0:
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    vcvtpd2ps %xmm0, %xmm0
+; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    retq
+  %1 = fptrunc <2 x double> %d to <2 x float>
+  %2 = fptrunc <2 x float> %1 to <2 x half>
+  ret <2 x half> %2
+}
+
+; The 'test4*' versions convert:
+;   x86_fp80 -> float -> half
+; Like the 'test3*' versions, we don't want to fold, even when
+; 'unsafe-fp-math' is true.
+define half @test4_fast(x86_fp80 %d) #0 {
+; F16C-LABEL: test4_fast:
+; F16C:       # %bb.0:
+; F16C-NEXT:    fldt {{[0-9]+}}(%rsp)
+; F16C-NEXT:    fstps -{{[0-9]+}}(%rsp)
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vmovd %xmm0, %eax
+; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT:    retq
+;
+; AVX-LABEL: test4_fast:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fstps {{[0-9]+}}(%rsp)
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    retq
+  %1 = fptrunc x86_fp80 %d to float
+  %2 = fptrunc float %1 to half
+  ret half %2
+}
+
+define half @test4(x86_fp80 %d) #1 {
+; F16C-LABEL: test4:
+; F16C:       # %bb.0:
+; F16C-NEXT:    fldt {{[0-9]+}}(%rsp)
+; F16C-NEXT:    fstps -{{[0-9]+}}(%rsp)
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vmovd %xmm0, %eax
+; F16C-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; F16C-NEXT:    retq
+;
+; AVX-LABEL: test4:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fstps {{[0-9]+}}(%rsp)
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    retq
+  %1 = fptrunc x86_fp80 %d to float
+  %2 = fptrunc float %1 to half
+  ret half %2
+}
+
+define <2 x half> @test4_vector_fast(<2 x x86_fp80> %d) #0 {
+; F16C-LABEL: test4_vector_fast:
+; F16C:       # %bb.0:
+; F16C-NEXT:    fldt {{[0-9]+}}(%rsp)
+; F16C-NEXT:    fldt {{[0-9]+}}(%rsp)
+; F16C-NEXT:    fstps -{{[0-9]+}}(%rsp)
+; F16C-NEXT:    fstps -{{[0-9]+}}(%rsp)
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; F16C-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],zero,zero
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    retq
+;
+; AVX-LABEL: test4_vector_fast:
+; AVX:       # %bb.0:
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fstps {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fstps {{[0-9]+}}(%rsp)
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    retq
+  %1 = fptrunc <2 x x86_fp80> %d to <2 x float>
+  %2 = fptrunc <2 x float> %1 to <2 x half>
+  ret <2 x half> %2
+}
+
+define <2 x half> @test4_vector(<2 x x86_fp80> %d) #1 {
+; F16C-LABEL: test4_vector:
+; F16C:       # %bb.0:
+; F16C-NEXT:    fldt {{[0-9]+}}(%rsp)
+; F16C-NEXT:    fldt {{[0-9]+}}(%rsp)
+; F16C-NEXT:    fstps -{{[0-9]+}}(%rsp)
+; F16C-NEXT:    fstps -{{[0-9]+}}(%rsp)
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; F16C-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],zero,zero
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    retq
+;
+; AVX-LABEL: test4_vector:
+; AVX:       # %bb.0:
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fstps {{[0-9]+}}(%rsp)
+; AVX-NEXT:    fstps {{[0-9]+}}(%rsp)
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    retq
+  %1 = fptrunc <2 x x86_fp80> %d to <2 x float>
+  %2 = fptrunc <2 x float> %1 to <2 x half>
+  ret <2 x half> %2
+}
+
 declare i16 @llvm.convert.to.fp16.f64(double)
 declare i16 @llvm.convert.to.fp16.f80(x86_fp80)