Index: llvm/trunk/include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/SelectionDAG.h
+++ llvm/trunk/include/llvm/CodeGen/SelectionDAG.h
@@ -688,6 +688,10 @@
   /// Example: shuffle A, B, <0,5,2,7> -> shuffle B, A, <4,1,6,3>
   SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV);
 
+  /// Convert Op, which must be of float type, to the
+  /// float type VT, by either extending or rounding (by truncation).
+  SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT);
+
   /// Convert Op, which must be of integer type, to the
   /// integer type VT, by either any-extending or truncating it.
   SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -523,16 +523,17 @@
   return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops);
 }
 
-/// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
-/// so just return the element, ignoring the index.
-SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
-  EVT VT = N->getValueType(0);
-  SDValue Res = GetScalarizedVector(N->getOperand(0));
-  if (Res.getValueType() != VT)
-    Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res);
-  return Res;
-}
-
+/// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
+/// so just return the element, ignoring the index.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  SDValue Res = GetScalarizedVector(N->getOperand(0));
+  if (Res.getValueType() != VT)
+    Res = VT.isFloatingPoint()
+              ? DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Res)
+              : DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res);
+  return Res;
+}
 
 /// If the input condition is a vector that needs to be scalarized, it must be
 /// <1 x i1>, so just convert to a normal ISD::SELECT
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -959,6 +959,12 @@
   DbgInfo->clear();
 }
 
+SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) {
+  return VT.bitsGT(Op.getValueType())
+             ? getNode(ISD::FP_EXTEND, DL, VT, Op)
+             : getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL));
+}
+
 SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
   return VT.bitsGT(Op.getValueType()) ?
     getNode(ISD::ANY_EXTEND, DL, VT, Op) :
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -350,7 +350,8 @@
     EVT ValueSVT = ValueVT.getVectorElementType();
     if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT)
-      Val = DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
+      Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
+                                      : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
 
     return DAG.getBuildVector(ValueVT, DL, Val);
   }
@@ -543,10 +544,9 @@
       Val = DAG.getNode(
           ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
           DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
-
-      Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
     }
 
+    assert(Val.getValueType() == PartVT && "Unexpected vector part value type");
     Parts[0] = Val;
     return;
   }
Index: llvm/trunk/test/CodeGen/NVPTX/f16-instructions.ll
===================================================================
--- llvm/trunk/test/CodeGen/NVPTX/f16-instructions.ll
+++ llvm/trunk/test/CodeGen/NVPTX/f16-instructions.ll
@@ -36,6 +36,21 @@
   ret half %r
 }
 
+; CHECK-LABEL: test_fadd_v1f16(
+; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_fadd_v1f16_param_0];
+; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fadd_v1f16_param_1];
+; CHECK-F16-NEXT:   add.rn.f16     [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32     [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
+  %r = fadd <1 x half> %a, %b
+  ret <1 x half> %r
+}
+
 ; Check that we can lower fadd with immediate arguments.
 ; CHECK-LABEL: test_fadd_imm_0(
 ; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_fadd_imm_0_param_0];
Index: llvm/trunk/test/CodeGen/X86/pr31088.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr31088.ll
+++ llvm/trunk/test/CodeGen/X86/pr31088.ll
@@ -3,6 +3,63 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c | FileCheck %s --check-prefix=F16C
 
+define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
+; X86-LABEL: ir_fadd_v1f16:
+; X86:       # BB#0:
+; X86-NEXT:    subl $28, %esp
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    calll __gnu_f2h_ieee
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    calll __gnu_h2f_ieee
+; X86-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    calll __gnu_f2h_ieee
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    calll __gnu_h2f_ieee
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: ir_fadd_v1f16:
+; X64:       # BB#0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    callq __gnu_f2h_ieee
+; X64-NEXT:    movzwl %ax, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee
+; X64-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; X64-NEXT:    movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; X64-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    callq __gnu_f2h_ieee
+; X64-NEXT:    movzwl %ax, %edi
+; X64-NEXT:    callq __gnu_h2f_ieee
+; X64-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
+; X64-NEXT:    popq %rax
+; X64-NEXT:    retq
+;
+; F16C-LABEL: ir_fadd_v1f16:
+; F16C:       # BB#0:
+; F16C-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; F16C-NEXT:    retq
+  %retval = fadd <1 x half> %arg0, %arg1
+  ret <1 x half> %retval
+}
+
 define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
 ; X86-LABEL: ir_fadd_v2f16:
 ; X86:       # BB#0:
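
Note (not part of the patch): the new SelectionDAG::getFPExtendOrRound mirrors getAnyExtOrTrunc for floating-point values, choosing ISD::FP_EXTEND when the destination type is wider and ISD::FP_ROUND when it is narrower. A minimal caller-side sketch follows; the wrapper name coerceScalarFP is hypothetical and only illustrates how lowering code might use the helper.

// Illustrative sketch only -- not part of the patch above. The wrapper name
// coerceScalarFP is hypothetical; the SelectionDAG APIs it calls are real.
#include "llvm/CodeGen/SelectionDAG.h"
#include <cassert>

using namespace llvm;

static SDValue coerceScalarFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL,
                              EVT DestVT) {
  assert(Val.getValueType().isFloatingPoint() && DestVT.isFloatingPoint() &&
         "getFPExtendOrRound only converts between floating-point types");
  if (Val.getValueType() == DestVT)
    return Val; // Already the requested type; nothing to do.
  // Widening becomes ISD::FP_EXTEND; narrowing becomes ISD::FP_ROUND, with the
  // helper supplying the conservative 0 flag operand (the value may change
  // when rounded).
  return DAG.getFPExtendOrRound(Val, DL, DestVT);
}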