Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -18144,32 +18144,221 @@
 #undef MMA_VARIANTS_B1_XOR
 }
 
+static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
+                         const CallExpr *E) {
+  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
+  QualType ArgType = E->getArg(0)->getType();
+  clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
+  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
+  return CGF.Builder.CreateCall(
+      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
+      {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
+}
+
+static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
+                               const CallExpr *E) {
+  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
+  llvm::Type *ElemTy =
+      CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
+  return CGF.Builder.CreateCall(
+      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
+      {Ptr, CGF.EmitScalarExpr(E->getArg(1))});
+}
+
+static Value *MakeHalfType(unsigned BuiltinID, const CallExpr *E,
+                           CodeGenFunction &CGF) {
+  auto &C = CGF.CGM.getContext();
+  if (!(C.getLangOpts().NativeHalfType ||
+        !C.getTargetInfo().useFP16ConversionIntrinsics())) {
+    CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getName(BuiltinID).str() +
+                                       " requires native half type support.");
+    return nullptr;
+  }
+
+  Intrinsic::ID Intr;
+  switch (BuiltinID) {
+  default:
+    llvm_unreachable("Unknown builtin ID.");
+  case NVPTX::BI__nvvm_ldg_h:
+  case NVPTX::BI__nvvm_ldg_h2:
+    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, CGF, E);
+  case NVPTX::BI__nvvm_ldu_h:
+  case NVPTX::BI__nvvm_ldu_h2:
+    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f, CGF, E);
+  case NVPTX::BI__nvvm_ex2_approx_f16:
+    Intr = Intrinsic::nvvm_ex2_approx_f16;
+    break;
+  case NVPTX::BI__nvvm_ex2_approx_f16x2:
+    Intr = Intrinsic::nvvm_ex2_approx_f16x2;
+    break;
+  case NVPTX::BI__nvvm_ff2f16x2_rn:
+    Intr = Intrinsic::nvvm_ff2f16x2_rn;
+    break;
+  case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
+    Intr = Intrinsic::nvvm_ff2f16x2_rn_relu;
+    break;
+  case NVPTX::BI__nvvm_ff2f16x2_rz:
+    Intr = Intrinsic::nvvm_ff2f16x2_rz;
+    break;
+  case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
+    Intr = Intrinsic::nvvm_ff2f16x2_rz_relu;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_f16:
+    Intr = Intrinsic::nvvm_fma_rn_f16;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_f16x2:
+    Intr = Intrinsic::nvvm_fma_rn_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_ftz_f16:
+    Intr = Intrinsic::nvvm_fma_rn_ftz_f16;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
+    Intr = Intrinsic::nvvm_fma_rn_ftz_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
+    Intr = Intrinsic::nvvm_fma_rn_ftz_relu_f16;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
+    Intr = Intrinsic::nvvm_fma_rn_ftz_relu_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
+    Intr = Intrinsic::nvvm_fma_rn_ftz_sat_f16;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
+    Intr = Intrinsic::nvvm_fma_rn_ftz_sat_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_relu_f16:
+    Intr = Intrinsic::nvvm_fma_rn_relu_f16;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
+    Intr = Intrinsic::nvvm_fma_rn_relu_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_sat_f16:
+    Intr = Intrinsic::nvvm_fma_rn_sat_f16;
+    break;
+  case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
+    Intr = Intrinsic::nvvm_fma_rn_sat_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmax_f16:
+    Intr = Intrinsic::nvvm_fmax_f16;
+    break;
+  case NVPTX::BI__nvvm_fmax_f16x2:
+    Intr = Intrinsic::nvvm_fmax_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmax_ftz_f16:
+    Intr = Intrinsic::nvvm_fmax_ftz_f16;
+    break;
+  case NVPTX::BI__nvvm_fmax_ftz_f16x2:
+    Intr = Intrinsic::nvvm_fmax_ftz_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
+    Intr = Intrinsic::nvvm_fmax_ftz_nan_f16;
+    break;
+  case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
+    Intr = Intrinsic::nvvm_fmax_ftz_nan_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
+    Intr = Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16;
+    break;
+  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
+    Intr = Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
+    Intr = Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16;
+    break;
+  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
+    Intr = Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmax_nan_f16:
+    Intr = Intrinsic::nvvm_fmax_nan_f16;
+    break;
+  case NVPTX::BI__nvvm_fmax_nan_f16x2:
+    Intr = Intrinsic::nvvm_fmax_nan_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
+    Intr = Intrinsic::nvvm_fmax_nan_xorsign_abs_f16;
+    break;
+  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
+    Intr = Intrinsic::nvvm_fmax_nan_xorsign_abs_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
+    Intr = Intrinsic::nvvm_fmax_xorsign_abs_f16;
+    break;
+  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
+    Intr = Intrinsic::nvvm_fmax_xorsign_abs_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmin_f16:
+    Intr = Intrinsic::nvvm_fmin_f16;
+    break;
+  case NVPTX::BI__nvvm_fmin_f16x2:
+    Intr = Intrinsic::nvvm_fmin_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmin_ftz_f16:
+    Intr = Intrinsic::nvvm_fmin_ftz_f16;
+    break;
+  case NVPTX::BI__nvvm_fmin_ftz_f16x2:
+    Intr = Intrinsic::nvvm_fmin_ftz_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
+    Intr = Intrinsic::nvvm_fmin_ftz_nan_f16;
+    break;
+  case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
+    Intr = Intrinsic::nvvm_fmin_ftz_nan_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
+    Intr = Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16;
+    break;
+  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
+    Intr = Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
+    Intr = Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16;
+    break;
+  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
+    Intr = Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmin_nan_f16:
+    Intr = Intrinsic::nvvm_fmin_nan_f16;
+    break;
+  case NVPTX::BI__nvvm_fmin_nan_f16x2:
+    Intr = Intrinsic::nvvm_fmin_nan_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
+    Intr = Intrinsic::nvvm_fmin_nan_xorsign_abs_f16;
+    break;
+  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
+    Intr = Intrinsic::nvvm_fmin_nan_xorsign_abs_f16x2;
+    break;
+  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
+    Intr = Intrinsic::nvvm_fmin_xorsign_abs_f16;
+    break;
+  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
+    Intr = Intrinsic::nvvm_fmin_xorsign_abs_f16x2;
+    break;
+  }
+
+  SmallVector<Value *, 16> Args;
+  auto *F = CGF.CGM.getIntrinsic(Intr);
+  auto *FTy = F->getFunctionType();
+  unsigned ICEArguments = 0;
+  ASTContext::GetBuiltinTypeError Error;
+  C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
+  assert(Error == ASTContext::GE_None && "Should not codegen an error");
+  for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
+    assert((ICEArguments & (1 << i)) == 0);
+    auto *ArgValue = CGF.EmitScalarExpr(E->getArg(i));
+    auto *PTy = FTy->getParamType(i);
+    if (PTy != ArgValue->getType())
+      ArgValue = CGF.Builder.CreateBitCast(ArgValue, PTy);
+    Args.push_back(ArgValue);
+  }
+
+  return CGF.Builder.CreateCall(F, Args);
+}
 } // namespace
 
-Value *
-CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) {
-  auto HasHalfSupport = [&](unsigned BuiltinID) {
-    auto &Context = getContext();
-    return Context.getLangOpts().NativeHalfType ||
-           !Context.getTargetInfo().useFP16ConversionIntrinsics();
-  };
-  auto MakeLdgLdu = [&](unsigned IntrinsicID) {
-    Value *Ptr = EmitScalarExpr(E->getArg(0));
-    QualType ArgType = E->getArg(0)->getType();
-    clang::CharUnits Align = CGM.getNaturalPointeeTypeAlignment(ArgType);
-    llvm::Type *ElemTy = ConvertTypeForMem(ArgType->getPointeeType());
-    return Builder.CreateCall(
-        CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
-        {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())});
-  };
-  auto MakeScopedAtomic = [&](unsigned IntrinsicID) {
-    Value *Ptr = EmitScalarExpr(E->getArg(0));
-    llvm::Type *ElemTy =
-        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
-    return Builder.CreateCall(
-        CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
-        {Ptr, EmitScalarExpr(E->getArg(1))});
-  };
+Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
+                                             const CallExpr *E) {
   switch (BuiltinID) {
   case NVPTX::BI__nvvm_atom_add_gen_i:
   case NVPTX::BI__nvvm_atom_add_gen_l:
   case NVPTX::BI__nvvm_atom_add_gen_ll:
@@ -18279,22 +18468,13 @@
   // PTX Interoperability section 2.2: "For a vector with an even number of
   // elements, its alignment is set to number of elements times the alignment
   // of its member: n*alignof(t)."
-    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i);
-  case NVPTX::BI__nvvm_ldg_h:
-  case NVPTX::BI__nvvm_ldg_h2:
-    if (!HasHalfSupport(BuiltinID)) {
-      CGM.Error(E->getExprLoc(),
-                getContext().BuiltinInfo.getName(BuiltinID).str() +
-                    " requires native half type support.");
-      return nullptr;
-    }
-    [[fallthrough]];
+    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);
   case NVPTX::BI__nvvm_ldg_f:
   case NVPTX::BI__nvvm_ldg_f2:
   case NVPTX::BI__nvvm_ldg_f4:
   case NVPTX::BI__nvvm_ldg_d:
   case NVPTX::BI__nvvm_ldg_d2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f);
+    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, *this, E);
 
   case NVPTX::BI__nvvm_ldu_c:
   case NVPTX::BI__nvvm_ldu_c2:
@@ -18320,105 +18500,96 @@
   case NVPTX::BI__nvvm_ldu_ul:
   case NVPTX::BI__nvvm_ldu_ull:
   case NVPTX::BI__nvvm_ldu_ull2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i);
-  case NVPTX::BI__nvvm_ldu_h:
-  case NVPTX::BI__nvvm_ldu_h2:
-    if (!HasHalfSupport(BuiltinID)) {
-      CGM.Error(E->getExprLoc(),
-                getContext().BuiltinInfo.getName(BuiltinID).str() +
-                    " requires native half type support.");
-      return nullptr;
-    }
-    [[fallthrough]];
+    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
   case NVPTX::BI__nvvm_ldu_f:
   case NVPTX::BI__nvvm_ldu_f2:
   case NVPTX::BI__nvvm_ldu_f4:
   case NVPTX::BI__nvvm_ldu_d:
   case NVPTX::BI__nvvm_ldu_d2:
-    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f);
+    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
 
   case NVPTX::BI__nvvm_atom_cta_add_gen_i:
   case NVPTX::BI__nvvm_atom_cta_add_gen_l:
   case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta, *this, E);
   case NVPTX::BI__nvvm_atom_sys_add_gen_i:
   case NVPTX::BI__nvvm_atom_sys_add_gen_l:
   case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys, *this, E);
   case NVPTX::BI__nvvm_atom_cta_add_gen_f:
   case NVPTX::BI__nvvm_atom_cta_add_gen_d:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta, *this, E);
   case NVPTX::BI__nvvm_atom_sys_add_gen_f:
   case NVPTX::BI__nvvm_atom_sys_add_gen_d:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys, *this, E);
   case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
   case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
   case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta, *this, E);
   case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
   case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
   case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys, *this, E);
   case NVPTX::BI__nvvm_atom_cta_max_gen_i:
   case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
   case NVPTX::BI__nvvm_atom_cta_max_gen_l:
   case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
   case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
   case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta, *this, E);
   case NVPTX::BI__nvvm_atom_sys_max_gen_i:
   case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
   case NVPTX::BI__nvvm_atom_sys_max_gen_l:
   case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
   case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
   case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys, *this, E);
   case NVPTX::BI__nvvm_atom_cta_min_gen_i:
   case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
   case NVPTX::BI__nvvm_atom_cta_min_gen_l:
   case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
   case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
   case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta, *this, E);
   case NVPTX::BI__nvvm_atom_sys_min_gen_i:
   case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
   case NVPTX::BI__nvvm_atom_sys_min_gen_l:
   case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
   case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
   case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys, *this, E);
   case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta, *this, E);
   case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta, *this, E);
   case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys, *this, E);
   case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys, *this, E);
   case NVPTX::BI__nvvm_atom_cta_and_gen_i:
   case NVPTX::BI__nvvm_atom_cta_and_gen_l:
   case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta, *this, E);
   case NVPTX::BI__nvvm_atom_sys_and_gen_i:
   case NVPTX::BI__nvvm_atom_sys_and_gen_l:
   case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys, *this, E);
   case NVPTX::BI__nvvm_atom_cta_or_gen_i:
   case NVPTX::BI__nvvm_atom_cta_or_gen_l:
   case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta, *this, E);
   case NVPTX::BI__nvvm_atom_sys_or_gen_i:
   case NVPTX::BI__nvvm_atom_sys_or_gen_l:
   case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys, *this, E);
   case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
   case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
   case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta, *this, E);
   case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
   case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
   case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
-    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys);
+    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
   case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
   case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
   case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
@@ -18683,6 +18854,63 @@
                                      CharUnits::fromQuantity(4));
     return Result;
   }
+  // The following builtins require half type support
+  case NVPTX::BI__nvvm_ex2_approx_f16:
+  case NVPTX::BI__nvvm_ex2_approx_f16x2:
+  case NVPTX::BI__nvvm_ff2f16x2_rn:
+  case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
+  case NVPTX::BI__nvvm_ff2f16x2_rz:
+  case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
+  case NVPTX::BI__nvvm_fma_rn_f16:
+  case NVPTX::BI__nvvm_fma_rn_f16x2:
+  case NVPTX::BI__nvvm_fma_rn_ftz_f16:
+  case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
+  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
+  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
+  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
+  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
+  case NVPTX::BI__nvvm_fma_rn_relu_f16:
+  case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
+  case NVPTX::BI__nvvm_fma_rn_sat_f16:
+  case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
+  case NVPTX::BI__nvvm_fmax_f16:
+  case NVPTX::BI__nvvm_fmax_f16x2:
+  case NVPTX::BI__nvvm_fmax_ftz_f16:
+  case NVPTX::BI__nvvm_fmax_ftz_f16x2:
+  case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
+  case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
+  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
+  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
+  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
+  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
+  case NVPTX::BI__nvvm_fmax_nan_f16:
+  case NVPTX::BI__nvvm_fmax_nan_f16x2:
+  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
+  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
+  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
+  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
+  case NVPTX::BI__nvvm_fmin_f16:
+  case NVPTX::BI__nvvm_fmin_f16x2:
+  case NVPTX::BI__nvvm_fmin_ftz_f16:
+  case NVPTX::BI__nvvm_fmin_ftz_f16x2:
+  case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
+  case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
+  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
+  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
+  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
+  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
+  case NVPTX::BI__nvvm_fmin_nan_f16:
+  case NVPTX::BI__nvvm_fmin_nan_f16x2:
+  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
+  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
+  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
+  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
+  case NVPTX::BI__nvvm_ldg_h:
+  case NVPTX::BI__nvvm_ldg_h2:
+  case NVPTX::BI__nvvm_ldu_h:
+  case NVPTX::BI__nvvm_ldu_h2: {
+    return MakeHalfType(BuiltinID, E, *this);
+  }
   default:
     return nullptr;
   }
Index: clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
===================================================================
--- clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
+++ clang/test/CodeGen/builtins-nvptx-native-half-type-err.c
@@ -1,21 +1,119 @@
 // REQUIRES: nvptx-registered-target
 //
 // RUN: not %clang_cc1 -fsyntax-only -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
-// RUN:   sm_75 -target-feature +ptx70 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-ERROR %s
+// RUN:   sm_86 -target-feature +ptx72 -fcuda-is-device -x cuda -emit-llvm -o - %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK_ERROR %s
 
 #define __device__ __attribute__((device))
 typedef __fp16 __fp16v2 __attribute__((ext_vector_type(2)));
 
-__device__ void nvvm_ldg_ldu_native_half_types(const void *p) {
-  __nvvm_ldg_h((const __fp16 *)p);
-  __nvvm_ldg_h2((const __fp16v2 *)p);
+__device__ void nvvm_native_half_types(void *a, void*b, void*c, __fp16* out) {
+  __fp16v2 resv2 = {0, 0};
+  *out += __nvvm_ex2_approx_f16(*(__fp16 *)a);
+  resv2 = __nvvm_ex2_approx_f16x2(*(__fp16v2*)a);
 
-  __nvvm_ldu_h((const __fp16 *)p);
-  __nvvm_ldu_h2((const __fp16v2 *)p);
+  *out += __nvvm_fma_rn_relu_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  *out += __nvvm_fma_rn_ftz_relu_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16 *)c);
+  resv2 += __nvvm_fma_rn_relu_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  resv2 += __nvvm_fma_rn_ftz_relu_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  *out += __nvvm_fma_rn_ftz_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  *out += __nvvm_fma_rn_sat_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  *out += __nvvm_fma_rn_ftz_sat_f16(*(__fp16*)a, *(__fp16*)b, *(__fp16*)c);
+  resv2 += __nvvm_fma_rn_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  resv2 += __nvvm_fma_rn_ftz_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  resv2 += __nvvm_fma_rn_sat_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+  resv2 += __nvvm_fma_rn_ftz_sat_f16x2(*(__fp16v2*)a, *(__fp16v2*)b, *(__fp16v2*)c);
+
+  *out += __nvvm_fmin_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmin_ftz_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmin_nan_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmin_ftz_nan_f16(*(__fp16*)a, *(__fp16*)b);
+  resv2 += __nvvm_fmin_f16x2(*(__fp16v2*)a , *(__fp16v2*)b);
+  resv2 += __nvvm_fmin_ftz_f16x2(*(__fp16v2*)a , *(__fp16v2*)b);
+  resv2 += __nvvm_fmin_nan_f16x2(*(__fp16v2*)a , *(__fp16v2*)b);
+  resv2 += __nvvm_fmin_ftz_nan_f16x2(*(__fp16v2*)a , *(__fp16v2*)b);
+  *out += __nvvm_fmin_xorsign_abs_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmin_ftz_xorsign_abs_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmin_nan_xorsign_abs_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmin_ftz_nan_xorsign_abs_f16(*(__fp16*)a, *(__fp16*)b);
+  resv2 += __nvvm_fmin_xorsign_abs_f16x2(*(__fp16v2*)a, *(__fp16v2*)b);
+  resv2 += __nvvm_fmin_ftz_xorsign_abs_f16x2(*(__fp16v2*)a, *(__fp16v2*)b);
+  resv2 += __nvvm_fmin_nan_xorsign_abs_f16x2(*(__fp16v2*)a, *(__fp16v2*)b);
+  resv2 += __nvvm_fmin_ftz_nan_xorsign_abs_f16x2(*(__fp16v2*)a, *(__fp16v2*)b);
+
+  *out += __nvvm_fmax_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmax_ftz_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmax_nan_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmax_ftz_nan_f16(*(__fp16*)a, *(__fp16*)b);
+  resv2 += __nvvm_fmax_f16x2(*(__fp16v2*)a , *(__fp16v2*)b);
+  resv2 += __nvvm_fmax_ftz_f16x2(*(__fp16v2*)a , *(__fp16v2*)b);
+  resv2 += __nvvm_fmax_nan_f16x2(*(__fp16v2*)a , *(__fp16v2*)b);
+  resv2 += __nvvm_fmax_ftz_nan_f16x2(*(__fp16v2*)a , *(__fp16v2*)b);
+  *out += __nvvm_fmax_xorsign_abs_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmax_ftz_xorsign_abs_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmax_nan_xorsign_abs_f16(*(__fp16*)a, *(__fp16*)b);
+  *out += __nvvm_fmax_ftz_nan_xorsign_abs_f16(*(__fp16*)a, *(__fp16*)b);
+  resv2 += __nvvm_fmax_xorsign_abs_f16x2(*(__fp16v2*)a, *(__fp16v2*)b);
+  resv2 += __nvvm_fmax_ftz_xorsign_abs_f16x2(*(__fp16v2*)a, *(__fp16v2*)b);
+  resv2 += __nvvm_fmax_nan_xorsign_abs_f16x2(*(__fp16v2*)a, *(__fp16v2*)b);
+  resv2 += __nvvm_fmax_ftz_nan_xorsign_abs_f16x2(*(__fp16v2*)a, *(__fp16v2*)b);
+
+  *out += __nvvm_ldg_h((__fp16 *)a);
+  resv2 += __nvvm_ldg_h2((__fp16v2 *)a);
+
+  *out += __nvvm_ldu_h((__fp16 *)a);
+  resv2 += __nvvm_ldu_h2((__fp16v2 *)a);
+
+  *out += resv2[0] + resv2[1];
+}
 
-// CHECK-ERROR: error: __nvvm_ldg_h requires native half type support.
-// CHECK-ERROR: error: __nvvm_ldg_h2 requires native half type support.
-// CHECK-ERROR: error: __nvvm_ldu_h requires native half type support.
-// CHECK-ERROR: error: __nvvm_ldu_h2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_ex2_approx_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_ex2_approx_f16x2 requires native half type support.
+
+// CHECK_ERROR: error: __nvvm_fma_rn_relu_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fma_rn_ftz_relu_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fma_rn_relu_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fma_rn_ftz_relu_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fma_rn_ftz_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fma_rn_sat_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fma_rn_ftz_sat_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fma_rn_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fma_rn_ftz_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fma_rn_sat_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fma_rn_ftz_sat_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_ftz_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_nan_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_ftz_nan_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_ftz_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_nan_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_ftz_nan_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_xorsign_abs_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_ftz_xorsign_abs_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_nan_xorsign_abs_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_ftz_nan_xorsign_abs_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_xorsign_abs_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_ftz_xorsign_abs_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_nan_xorsign_abs_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmin_ftz_nan_xorsign_abs_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_ftz_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_nan_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_ftz_nan_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_ftz_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_nan_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_ftz_nan_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_xorsign_abs_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_ftz_xorsign_abs_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_nan_xorsign_abs_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_ftz_nan_xorsign_abs_f16 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_xorsign_abs_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_ftz_xorsign_abs_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_nan_xorsign_abs_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_fmax_ftz_nan_xorsign_abs_f16x2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_ldg_h requires native half type support.
+// CHECK_ERROR: error: __nvvm_ldg_h2 requires native half type support.
+// CHECK_ERROR: error: __nvvm_ldu_h requires native half type support.
+// CHECK_ERROR: error: __nvvm_ldu_h2 requires native half type support.
Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -583,7 +583,6 @@
       "_xorsign_abs_f16", "_ftz_xorsign_abs_f16", "_nan_xorsign_abs_f16",
       "_ftz_nan_xorsign_abs_f16"] in {
       def int_nvvm_f # operation # variant :
-          ClangBuiltin<!strconcat("__nvvm_f", operation, variant)>,
           DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty],
             [IntrNoMem, IntrSpeculatable, Commutative]>;
     }
@@ -592,7 +591,6 @@
       "_ftz_nan_f16x2", "_xorsign_abs_f16x2", "_ftz_xorsign_abs_f16x2",
       "_nan_xorsign_abs_f16x2", "_ftz_nan_xorsign_abs_f16x2"] in {
       def int_nvvm_f # operation # variant :
-          ClangBuiltin<!strconcat("__nvvm_f", operation, variant)>,
           DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty],
             [IntrNoMem, IntrSpeculatable, Commutative]>;
     }
@@ -828,9 +826,9 @@
       DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
   def int_nvvm_ex2_approx_d : ClangBuiltin<"__nvvm_ex2_approx_d">,
       DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
-  def int_nvvm_ex2_approx_f16 : ClangBuiltin<"__nvvm_ex2_approx_f16">,
+  def int_nvvm_ex2_approx_f16 :
       DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>;
-  def int_nvvm_ex2_approx_f16x2 : ClangBuiltin<"__nvvm_ex2_approx_f16x2">,
+  def int_nvvm_ex2_approx_f16x2 :
      DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>;
 
   def int_nvvm_lg2_approx_ftz_f : ClangBuiltin<"__nvvm_lg2_approx_ftz_f">,
@@ -860,18 +858,16 @@
 
   foreach variant = ["_rn_f16", "_rn_ftz_f16", "_rn_sat_f16", "_rn_ftz_sat_f16",
     "_rn_relu_f16", "_rn_ftz_relu_f16"] in {
-    def int_nvvm_fma # variant : ClangBuiltin<!strconcat("__nvvm_fma", variant)>,
-      DefaultAttrsIntrinsic<[llvm_half_ty],
-        [llvm_half_ty, llvm_half_ty, llvm_half_ty],
-        [IntrNoMem, IntrSpeculatable]>;
+    def int_nvvm_fma # variant : DefaultAttrsIntrinsic<[llvm_half_ty],
+      [llvm_half_ty, llvm_half_ty, llvm_half_ty],
+      [IntrNoMem, IntrSpeculatable]>;
   }
 
   foreach variant = ["_rn_f16x2", "_rn_ftz_f16x2", "_rn_sat_f16x2",
     "_rn_ftz_sat_f16x2", "_rn_relu_f16x2", "_rn_ftz_relu_f16x2"] in {
-    def int_nvvm_fma # variant : ClangBuiltin<!strconcat("__nvvm_fma", variant)>,
-      DefaultAttrsIntrinsic<[llvm_v2f16_ty],
-        [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty],
-        [IntrNoMem, IntrSpeculatable]>;
+    def int_nvvm_fma # variant : DefaultAttrsIntrinsic<[llvm_v2f16_ty],
+      [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty],
+      [IntrNoMem, IntrSpeculatable]>;
   }
 
   foreach variant = ["_rn_bf16", "_rn_relu_bf16"] in {
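
Note (illustrative, not part of the patch): the guarded builtins above remain usable when native half support is actually available, for example when LangOpts.NativeHalfType is enabled (the cc1 flag -fnative-half-type, as in the companion non-error test) or the target does not use FP16 conversion intrinsics, which is the exact condition MakeHalfType checks. A minimal CUDA sketch of the success path; the function name fold_halves and the particular builtins chosen are made-up examples:

#define __device__ __attribute__((device))

// Hypothetical usage: with native half support, each call below goes through
// MakeHalfType/MakeLdgLdu and lowers to the matching NVVM intrinsic
// (an Intrinsic::nvvm_ldg_global_f overload and Intrinsic::nvvm_fmax_f16)
// instead of emitting the "requires native half type support." diagnostic.
__device__ __fp16 fold_halves(const __fp16 *a, const __fp16 *b) {
  __fp16 x = __nvvm_ldg_h(a);   // read-only global load of one half value
  __fp16 y = __nvvm_ldg_h(b);
  return __nvvm_fmax_f16(x, y); // half-precision maximum
}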