[NVPTX] Fold constant v2f16 BUILD_VECTOR during lowering instead of isel.

NVPTXDAGToDAGISel::tryBUILD_VECTOR is removed. BUILD_VECTOR on v2f16 is now
marked Custom, and NVPTXTargetLowering::LowerBUILD_VECTOR turns a vector of
two FP constants into an i32 constant that is bitcast to v2f16. The f16x2
tests are updated to expect the constant in a register (mov.u32 + mov.b32)
rather than as an immediate operand.

NOTE(review): line breaks and leading indentation in this patch were
reconstructed; if whitespace does not match the tree, apply with
`patch -p0 -l`.

Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.h
===================================================================
--- lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -73,7 +73,6 @@
   bool tryConstantFP16(SDNode *N);
   bool SelectSETP_F16X2(SDNode *N);
   bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
-  bool tryBUILD_VECTOR(SDNode *N);
 
   inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
     return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
Index: lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -88,10 +88,6 @@
     if (tryEXTRACT_VECTOR_ELEMENT(N))
       return;
     break;
-  case ISD::BUILD_VECTOR:
-    if (tryBUILD_VECTOR(N))
-      return;
-    break;
   case NVPTXISD::SETP_F16X2:
     SelectSETP_F16X2(N);
     return;
@@ -649,33 +645,6 @@
   return true;
 }
 
-// We can init constant f16x2 with a single .b32 move. Normally it
-// would get lowered as two constant loads and vector-packing move.
-// mov.b16 %h1, 0x4000;
-// mov.b16 %h2, 0x3C00;
-// mov.b32 %hh2, {%h2, %h1};
-// Instead we want just a constant move:
-// mov.b32 %hh2, 0x40003C00
-//
-// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
-// generates good SASS in both cases.
-
-bool NVPTXDAGToDAGISel::tryBUILD_VECTOR(SDNode *N) {
-  if (!(N->getValueType(0) == MVT::v2f16 &&
-        isa<ConstantFPSDNode>(N->getOperand(0)) &&
-        isa<ConstantFPSDNode>(N->getOperand(1))))
-    return false;
-
-  APInt E0 =
-      cast<ConstantFPSDNode>(N->getOperand(0))->getValueAPF().bitcastToAPInt();
-  APInt E1 =
-      cast<ConstantFPSDNode>(N->getOperand(1))->getValueAPF().bitcastToAPInt();
-  SDValue Const = CurDAG->getTargetConstant(E1.zext(32).shl(16) | E0.zext(32),
-                                            SDLoc(N), MVT::i32);
-  ReplaceUses(SDValue(N, 0), Const);
-  return true;
-}
-
 static unsigned int getCodeAddrSpace(MemSDNode *N) {
   const Value *Src = N->getMemOperand()->getValue();
 
Index: lib/Target/NVPTX/NVPTXISelLowering.h
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.h
+++ lib/Target/NVPTX/NVPTXISelLowering.h
@@ -550,6 +550,7 @@
   const NVPTXSubtarget &STI; // cache the subtarget here
   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
 
+  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
 
Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -362,7 +362,7 @@
   // Conversion to/from FP16/FP16x2 is always legal.
   setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
   setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
-  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
 
   setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
@@ -1847,6 +1847,32 @@
   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
 }
 
+// We can init constant f16x2 with a single .b32 move. Normally it
+// would get lowered as two constant loads and vector-packing move.
+// mov.b16 %h1, 0x4000;
+// mov.b16 %h2, 0x3C00;
+// mov.b32 %hh2, {%h2, %h1};
+// Instead we want just a constant move:
+// mov.b32 %hh2, 0x40003C00
+//
+// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
+// generates good SASS in both cases.
+SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  if (!(Op->getValueType(0) == MVT::v2f16 &&
+        isa<ConstantFPSDNode>(Op->getOperand(0)) &&
+        isa<ConstantFPSDNode>(Op->getOperand(1))))
+    return Op;
+
+  APInt E0 =
+      cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
+  APInt E1 =
+      cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
+  SDValue Const =
+      DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
+}
+
 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                      SelectionDAG &DAG) const {
   SDValue Index = Op->getOperand(1);
@@ -2002,6 +2028,7 @@
   case ISD::INTRINSIC_W_CHAIN:
     return Op;
   case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return Op;
   case ISD::EXTRACT_VECTOR_ELT:
Index: test/CodeGen/NVPTX/f16x2-instructions.ll
===================================================================
--- test/CodeGen/NVPTX/f16x2-instructions.ll
+++ test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -14,7 +14,9 @@
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
 ; CHECK-LABEL: test_ret_const(
-; CHECK: st.param.b32 [func_retval0+0], 1073757184;
+; CHECK: mov.u32 [[T:%r[0-9]+]], 1073757184;
+; CHECK: mov.b32 [[R:%hh[0-9]+]], [[T]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define <2 x half> @test_ret_const() #0 {
   ret <2 x half> <half 1.0, half 2.0>
@@ -82,7 +84,9 @@
 ; CHECK-LABEL: test_fadd_imm_0(
 ; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
 ;
-; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], 1073757184;
+; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]];
 ;
 ; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
@@ -103,7 +107,9 @@
 ; CHECK-LABEL: test_fadd_imm_1(
 ; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
 ;
-; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], 1073757184;
+; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]];
 ;
 ; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
@@ -149,7 +155,9 @@
 ; CHECK-LABEL: test_fneg(
 ; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0];
 ;
-; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], 0, [[A]];
+; CHECK-F16: mov.u32 [[I0:%r[0-9]+]], 0;
+; CHECK-F16: mov.b32 [[IHH0:%hh[0-9]+]], [[I0]];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]];
 ;
 ; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]