Convert between single-element vectors and scalars according to the value
type: floating-point values go through the new
SelectionDAG::getFPExtendOrRound helper (FP_EXTEND when widening, FP_ROUND
otherwise), and every other type keeps the existing any-extend/truncate path.
Adds test/CodeGen/X86/pr31088.ll, which covers fadd on <1 x half> and
<2 x half> for SSE, AVX and F16C.

NOTE(review): the indentation of context lines was reconstructed; confirm it
against the target tree before applying.

Index: include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- include/llvm/CodeGen/SelectionDAG.h
+++ include/llvm/CodeGen/SelectionDAG.h
@@ -678,6 +678,10 @@
   /// Example: shuffle A, B, <0,5,2,7> -> shuffle B, A, <4,1,6,3>
   SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV);
 
+  /// Convert Op, which must be of float type, to the
+  /// float type VT, by either extending or rounding (by truncation).
+  SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT);
+
   /// Convert Op, which must be of integer type, to the
   /// integer type VT, by either any-extending or truncating it.
   SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT);
Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -526,10 +526,11 @@
 /// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
 /// so just return the element, ignoring the index.
 SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  EVT VT = N->getValueType(0);
   SDValue Res = GetScalarizedVector(N->getOperand(0));
-  if (Res.getValueType() != N->getValueType(0))
-    Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0),
-                      Res);
+  if (Res.getValueType() != VT)
+    Res = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND : ISD::ANY_EXTEND,
+                      SDLoc(N), VT, Res);
   return Res;
 }
 
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -984,6 +984,12 @@
   DbgInfo->clear();
 }
 
+SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) {
+  return VT.bitsGT(Op.getValueType())
+             ? getNode(ISD::FP_EXTEND, DL, VT, Op)
+             : getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL));
+}
+
 SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
   return VT.bitsGT(Op.getValueType()) ?
     getNode(ISD::ANY_EXTEND, DL, VT, Op) :
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -362,9 +362,11 @@
     return DAG.getUNDEF(ValueVT);
   }
 
-  if (ValueVT.getVectorNumElements() == 1 &&
-      ValueVT.getVectorElementType() != PartEVT)
-    Val = DAG.getAnyExtOrTrunc(Val, DL, ValueVT.getScalarType());
+  EVT ValueSVT = ValueVT.getVectorElementType();
+  if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT) {
+    Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
+                                    : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
+  }
 
   return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val);
 }
@@ -558,7 +560,8 @@
           ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
           DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
 
-      Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+      Val = PartVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, PartVT)
+                                     : DAG.getAnyExtOrTrunc(Val, DL, PartVT);
     }
 
     Parts[0] = Val;
Index: test/CodeGen/X86/pr31088.ll
===================================================================
--- test/CodeGen/X86/pr31088.ll
+++ test/CodeGen/X86/pr31088.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c | FileCheck %s --check-prefix=F16C
+
+define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
+; SSE-LABEL: ir_fadd_v1f16:
+; SSE:       # BB#0:
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    callq __gnu_f2h_ieee
+; SSE-NEXT:    movzwl %ax, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee
+; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT:    movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    callq __gnu_f2h_ieee
+; SSE-NEXT:    movzwl %ax, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee
+; SSE-NEXT:    addss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    popq %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: ir_fadd_v1f16:
+; AVX:       # BB#0:
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    callq __gnu_f2h_ieee
+; AVX-NEXT:    movzwl %ax, %edi
+; AVX-NEXT:    callq __gnu_h2f_ieee
+; AVX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __gnu_f2h_ieee
+; AVX-NEXT:    movzwl %ax, %edi
+; AVX-NEXT:    callq __gnu_h2f_ieee
+; AVX-NEXT:    vaddss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    retq
+;
+; F16C-LABEL: ir_fadd_v1f16:
+; F16C:       # BB#0:
+; F16C-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; F16C-NEXT:    retq
+  %retval = fadd <1 x half> %arg0, %arg1
+  ret <1 x half> %retval
+}
+
+define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
+; SSE-LABEL: ir_fadd_v2f16:
+; SSE:       # BB#0:
+; SSE-NEXT:    subq $24, %rsp
+; SSE-NEXT:    movss %xmm2, {{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE-NEXT:    movss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE-NEXT:    movaps %xmm3, %xmm0
+; SSE-NEXT:    callq __gnu_f2h_ieee
+; SSE-NEXT:    movzwl %ax, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE-NEXT:    movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    callq __gnu_f2h_ieee
+; SSE-NEXT:    movzwl %ax, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE-NEXT:    movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    callq __gnu_f2h_ieee
+; SSE-NEXT:    movzwl %ax, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee
+; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE-NEXT:    movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    callq __gnu_f2h_ieee
+; SSE-NEXT:    movzwl %ax, %edi
+; SSE-NEXT:    callq __gnu_h2f_ieee
+; SSE-NEXT:    addss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT:    movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    addss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
+; SSE-NEXT:    addq $24, %rsp
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: ir_fadd_v2f16:
+; AVX:       # BB#0:
+; AVX-NEXT:    subq $24, %rsp
+; AVX-NEXT:    vmovss %xmm2, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovaps %xmm3, %xmm0
+; AVX-NEXT:    callq __gnu_f2h_ieee
+; AVX-NEXT:    movzwl %ax, %edi
+; AVX-NEXT:    callq __gnu_h2f_ieee
+; AVX-NEXT:    vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __gnu_f2h_ieee
+; AVX-NEXT:    movzwl %ax, %edi
+; AVX-NEXT:    callq __gnu_h2f_ieee
+; AVX-NEXT:    vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __gnu_f2h_ieee
+; AVX-NEXT:    movzwl %ax, %edi
+; AVX-NEXT:    callq __gnu_h2f_ieee
+; AVX-NEXT:    vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; AVX-NEXT:    vmovss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    callq __gnu_f2h_ieee
+; AVX-NEXT:    movzwl %ax, %edi
+; AVX-NEXT:    callq __gnu_h2f_ieee
+; AVX-NEXT:    vaddss {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX-NEXT:    vmovss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; AVX-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vaddss {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 4-byte Folded Reload
+; AVX-NEXT:    addq $24, %rsp
+; AVX-NEXT:    retq
+;
+; F16C-LABEL: ir_fadd_v2f16:
+; F16C:       # BB#0:
+; F16C-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
+; F16C-NEXT:    vcvtph2ps %xmm3, %xmm3
+; F16C-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
+; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
+; F16C-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
+; F16C-NEXT:    vcvtph2ps %xmm2, %xmm2
+; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT:    vaddss %xmm2, %xmm0, %xmm0
+; F16C-NEXT:    vaddss %xmm3, %xmm1, %xmm1
+; F16C-NEXT:    retq
+  %retval = fadd <2 x half> %arg0, %arg1
+  ret <2 x half> %retval
+}
+