Index: clang/lib/Headers/__clang_cuda_runtime_wrapper.h
===================================================================
--- clang/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -100,11 +100,47 @@
 #undef __CUDACC__
 #define __CUDABE__
 
-// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
-// not have at the moment. Emulate them with a builtin memcpy/memset.
+// CUDA headers use __nvvm_memcpy and __nvvm_memset, which Clang does not have
+// at the moment. Emulate them with a builtin memcpy/memset.
 #define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
 #define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
 
+// The following NVVM builtins have been removed from clang and LLVM entirely,
+// as they correspond exactly to existing clang builtins or simple idioms.
+// They're emulated here, under their original names, so we don't break the
+// CUDA headers (and any user code that might have used them).
+#define __nvvm_brev32(__a) __builtin_bitreverse32(__a)
+#define __nvvm_brev64(__a) __builtin_bitreverse64(__a)
+#define __nvvm_clz_i(__a) __builtin_ctlz(__a)
+#define __nvvm_clz_ll(__a) __builtin_ctlzll(__a)
+#define __nvvm_popc_i(__a) __builtin_popcount(__a)
+#define __nvvm_popc_ll(__a) __builtin_popcountll(__a)
+#define __nvvm_abs_i(__a) __builtin_abs(__a)
+#define __nvvm_abs_ll(__a) __builtin_llabs(__a)
+inline int __nvvm_max_i(int __a, int __b) { return __a >= __b ? __a : __b; }
+inline unsigned int __nvvm_max_ui(unsigned int __a, unsigned int __b) {
+  return __a >= __b ? __a : __b;
+}
+inline long long __nvvm_max_ll(long long __a, long long __b) {
+  return __a >= __b ? __a : __b;
+}
+inline unsigned long long __nvvm_max_ull(unsigned long long __a,
+                                         unsigned long long __b) {
+  return __a >= __b ? __a : __b;
+}
+inline int __nvvm_min_i(int __a, int __b) { return __a < __b ? __a : __b; }
+inline unsigned int __nvvm_min_ui(unsigned int __a, unsigned int __b) {
+  return __a < __b ? __a : __b;
+}
+inline long long __nvvm_min_ll(long long __a, long long __b) {
+  return __a < __b ? __a : __b;
+}
+inline unsigned long long __nvvm_min_ull(unsigned long long __a,
+                                         unsigned long long __b) {
+  return __a < __b ? __a : __b;
+}
+inline float __nvvm_h2f(short __a) __asm("llvm.convert.from.fp16");
+
 #include "crt/device_runtime.h"
 #include "crt/host_runtime.h"
 // device_runtime.h defines __cxa_* macros that will conflict with
Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -18,16 +18,6 @@
 //
 
 let TargetPrefix = "nvvm" in {
-  def int_nvvm_clz_i : GCCBuiltin<"__nvvm_clz_i">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-  def int_nvvm_clz_ll : GCCBuiltin<"__nvvm_clz_ll">,
-      Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
-
-  def int_nvvm_popc_i : GCCBuiltin<"__nvvm_popc_i">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-  def int_nvvm_popc_ll : GCCBuiltin<"__nvvm_popc_ll">,
-      Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
-
   def int_nvvm_prmt : GCCBuiltin<"__nvvm_prmt">,
       Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
                 [IntrNoMem, Commutative]>;
@@ -36,34 +26,6 @@
 // Min Max
 //
 
-  def int_nvvm_min_i : GCCBuiltin<"__nvvm_min_i">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                [IntrNoMem, Commutative]>;
-  def int_nvvm_min_ui : GCCBuiltin<"__nvvm_min_ui">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                [IntrNoMem, Commutative]>;
-
-  def int_nvvm_min_ll : GCCBuiltin<"__nvvm_min_ll">,
-      Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                [IntrNoMem, Commutative]>;
-  def int_nvvm_min_ull : GCCBuiltin<"__nvvm_min_ull">,
-      Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                [IntrNoMem, Commutative]>;
-
-  def int_nvvm_max_i : GCCBuiltin<"__nvvm_max_i">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                [IntrNoMem, Commutative]>;
-  def int_nvvm_max_ui : GCCBuiltin<"__nvvm_max_ui">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                [IntrNoMem, Commutative]>;
-
-  def int_nvvm_max_ll : GCCBuiltin<"__nvvm_max_ll">,
-      Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                [IntrNoMem, Commutative]>;
-  def int_nvvm_max_ull : GCCBuiltin<"__nvvm_max_ull">,
-      Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                [IntrNoMem, Commutative]>;
-
   def int_nvvm_fmin_f : GCCBuiltin<"__nvvm_fmin_f">,
       Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
                 [IntrNoMem, Commutative]>;
@@ -200,15 +162,6 @@
       Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
                 [IntrNoMem, Commutative]>;
 
-//
-// Brev
-//
-
-  def int_nvvm_brev32 : GCCBuiltin<"__nvvm_brev32">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-  def int_nvvm_brev64 : GCCBuiltin<"__nvvm_brev64">,
-      Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem]>;
-
 //
 // Sad
 //
@@ -242,16 +195,10 @@
 // Abs
 //
 
-  def int_nvvm_abs_i : GCCBuiltin<"__nvvm_abs_i">,
-      Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-  def int_nvvm_abs_ll : GCCBuiltin<"__nvvm_abs_ll">,
-      Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem]>;
-
   def int_nvvm_fabs_ftz_f : GCCBuiltin<"__nvvm_fabs_ftz_f">,
       Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
   def int_nvvm_fabs_f : GCCBuiltin<"__nvvm_fabs_f">,
       Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
-
   def int_nvvm_fabs_d : GCCBuiltin<"__nvvm_fabs_d">,
       Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
 
Index: llvm/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/lib/IR/AutoUpgrade.cpp
+++ llvm/lib/IR/AutoUpgrade.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/AutoUpgrade.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
@@ -204,7 +205,39 @@
     }
     break;
   }
+  case 'n': {
+    if (Name.startswith("nvvm.")) {
+      Name = Name.substr(5);
+      // The following nvvm intrinsics correspond exactly to an LLVM intrinsic.
+      Intrinsic::ID IID = StringSwitch<Intrinsic::ID>(Name)
+                              .Cases("brev32", "brev64", Intrinsic::bitreverse)
+                              .Case("clz.i", Intrinsic::ctlz)
+                              .Case("popc.i", Intrinsic::ctpop)
+                              .Default(Intrinsic::not_intrinsic);
+      if (IID != Intrinsic::not_intrinsic && F->arg_size() == 1) {
+        NewFn = Intrinsic::getDeclaration(F->getParent(), IID,
+                                          {F->getReturnType()});
+        return true;
+      }
+
+      // The following nvvm intrinsics correspond exactly to an LLVM idiom, but
+      // not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
+      //
+      // TODO: We could add lohi.i2d.
+      bool Expand = StringSwitch<bool>(Name)
+                        .Cases("abs.i", "abs.ll", true)
+                        .Cases("clz.ll", "popc.ll", "h2f", true)
+                        .Cases("max.i", "max.ll", "max.ui", "max.ull", true)
+                        .Cases("min.i", "min.ll", "min.ui", "min.ull", true)
+                        .Default(false);
+      if (Expand) {
+        NewFn = nullptr;
+        return true;
+      }
+    }
+    break;
+  }
 
   case 'o':
     // We only need to change the name to match the mangling including the
     // address space.
@@ -753,6 +785,9 @@
     bool IsX86 = Name.startswith("x86.");
     if (IsX86)
       Name = Name.substr(4);
+    bool IsNVVM = Name.startswith("nvvm.");
+    if (IsNVVM)
+      Name = Name.substr(5);
 
     if (IsX86 && Name.startswith("sse4a.movnt.")) {
       Module *M = F->getParent();
@@ -1727,6 +1762,50 @@
                                { CI->getArgOperand(0), CI->getArgOperand(1) });
       Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                           CI->getArgOperand(2));
+    } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) {
+      Value *Arg = CI->getArgOperand(0);
+      Value *Neg = Builder.CreateNeg(Arg, "neg");
+      Value *Cmp = Builder.CreateICmpSGE(
+          Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
+      Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
+    } else if (IsNVVM && (Name == "max.i" || Name == "max.ll" ||
+                          Name == "max.ui" || Name == "max.ull")) {
+      Value *Arg0 = CI->getArgOperand(0);
+      Value *Arg1 = CI->getArgOperand(1);
+      Value *Cmp = Name.endswith(".ui") || Name.endswith(".ull")
+                       ? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
+                       : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
+      Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
+    } else if (IsNVVM && (Name == "min.i" || Name == "min.ll" ||
+                          Name == "min.ui" || Name == "min.ull")) {
+      Value *Arg0 = CI->getArgOperand(0);
+      Value *Arg1 = CI->getArgOperand(1);
+      Value *Cmp = Name.endswith(".ui") || Name.endswith(".ull")
+                       ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
+                       : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
+      Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
+    } else if (IsNVVM && Name == "clz.ll") {
+      // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
+      Value *Arg = CI->getArgOperand(0);
+      Value *Ctlz = Builder.CreateCall(
+          Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+                                    {Arg->getType()}),
+          {Arg, Builder.getFalse()}, "ctlz");
+      Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
+    } else if (IsNVVM && Name == "popc.ll") {
+      // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an i64,
+      // so the result is truncated.
+      Value *Arg = CI->getArgOperand(0);
+      Value *Popc = Builder.CreateCall(
+          Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
+                                    {Arg->getType()}),
+          Arg, "ctpop");
+      Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
+    } else if (IsNVVM && Name == "h2f") {
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(
+                                   F->getParent(), Intrinsic::convert_from_fp16,
+                                   {Builder.getFloatTy()}),
+                               CI->getArgOperand(0), "h2f");
     } else {
       llvm_unreachable("Unknown function for CallInst upgrade.");
     }
@@ -1786,11 +1865,13 @@
     CI->eraseFromParent();
     return;
 
-  case Intrinsic::ctpop: {
+  // The replacement call takes the original call's first operand unchanged.
+  case Intrinsic::bitreverse:
+  case Intrinsic::ctpop:
+  case Intrinsic::convert_from_fp16:
     CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {CI->getArgOperand(0)}));
     CI->eraseFromParent();
     return;
-  }
 
   case Intrinsic::x86_xop_vfrcz_ss:
   case Intrinsic::x86_xop_vfrcz_sd:
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -187,16 +187,6 @@
 // MISC
 //
 
-def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
-  int_nvvm_clz_i>;
-def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
-  int_nvvm_clz_ll>;
-
-def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
-  int_nvvm_popc_i>;
-def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
-  int_nvvm_popc_ll>;
-
 def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
   Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
 
@@ -204,26 +194,6 @@
 // Min Max
 //
 
-def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
-  Int32Regs, Int32Regs, int_nvvm_min_i>;
-def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
-  Int32Regs, Int32Regs, int_nvvm_min_ui>;
-
-def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
-  Int64Regs, Int64Regs, int_nvvm_min_ll>;
-def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
-  Int64Regs, Int64Regs, int_nvvm_min_ull>;
-
-def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
-  Int32Regs, Int32Regs, int_nvvm_max_i>;
-def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
-  Int32Regs, Int32Regs, int_nvvm_max_ui>;
-
-def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
-  Int64Regs, Int64Regs, int_nvvm_max_ll>;
-def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
-  Int64Regs, Int64Regs, int_nvvm_max_ull>;
-
 def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
   Float32Regs, Float32Regs, int_nvvm_fmin_f>;
 def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
@@ -239,6 +209,7 @@
 def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
   Float64Regs, Float64Regs, int_nvvm_fmax_d>;
 
+
 //
 // Multiplication
 //
@@ -320,15 +291,6 @@
 def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
   Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
 
-//
-// Brev
-//
-
-def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
-  int_nvvm_brev32>;
-def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs,
-  int_nvvm_brev64>;
-
 //
 // Sad
 //
@@ -360,11 +322,6 @@
 // Abs
 //
 
-def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs,
-  int_nvvm_abs_i>;
-def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs,
-  int_nvvm_abs_ll>;
-
 def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs,
   Float32Regs, int_nvvm_fabs_ftz_f>;
 def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
Index: llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
===================================================================
--- /dev/null
+++ llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -0,0 +1,102 @@
+; Test to make sure NVVM intrinsics are automatically upgraded.
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+declare i32 @llvm.nvvm.brev32(i32)
+declare i64 @llvm.nvvm.brev64(i64)
+declare i32 @llvm.nvvm.clz.i(i32)
+declare i32 @llvm.nvvm.clz.ll(i64)
+declare i32 @llvm.nvvm.popc.i(i32)
+declare i32 @llvm.nvvm.popc.ll(i64)
+declare float @llvm.nvvm.h2f(i16)
+
+declare i32 @llvm.nvvm.abs.i(i32)
+declare i64 @llvm.nvvm.abs.ll(i64)
+
+declare i32 @llvm.nvvm.max.i(i32, i32)
+declare i64 @llvm.nvvm.max.ll(i64, i64)
+declare i32 @llvm.nvvm.max.ui(i32, i32)
+declare i64 @llvm.nvvm.max.ull(i64, i64)
+declare i32 @llvm.nvvm.min.i(i32, i32)
+declare i64 @llvm.nvvm.min.ll(i64, i64)
+declare i32 @llvm.nvvm.min.ui(i32, i32)
+declare i64 @llvm.nvvm.min.ull(i64, i64)
+
+; CHECK-LABEL: @simple_upgrade
+define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
+; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
+  %r1 = call i32 @llvm.nvvm.brev32(i32 %a)
+
+; CHECK: call i64 @llvm.bitreverse.i64(i64 %b)
+  %r2 = call i64 @llvm.nvvm.brev64(i64 %b)
+
+; CHECK: call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+  %r3 = call i32 @llvm.nvvm.clz.i(i32 %a)
+
+; CHECK: [[clz:%[a-zA-Z0-9.]+]] = call i64 @llvm.ctlz.i64(i64 %b, i1 false)
+; CHECK: trunc i64 [[clz]] to i32
+  %r4 = call i32 @llvm.nvvm.clz.ll(i64 %b)
+
+; CHECK: call i32 @llvm.ctpop.i32(i32 %a)
+  %r5 = call i32 @llvm.nvvm.popc.i(i32 %a)
+
+; CHECK: [[popc:%[a-zA-Z0-9.]+]] = call i64 @llvm.ctpop.i64(i64 %b)
+; CHECK: trunc i64 [[popc]] to i32
+  %r6 = call i32 @llvm.nvvm.popc.ll(i64 %b)
+
+; CHECK: call float @llvm.convert.from.fp16.f32(i16 %c)
+  %r7 = call float @llvm.nvvm.h2f(i16 %c)
+  ret void
+}
+
+; CHECK-LABEL: @abs
+define void @abs(i32 %a, i64 %b) {
+; CHECK-DAG: [[negi:%[a-zA-Z0-9.]+]] = sub i32 0, %a
+; CHECK-DAG: [[cmpi:%[a-zA-Z0-9.]+]] = icmp sge i32 %a, 0
+; CHECK: select i1 [[cmpi]], i32 %a, i32 [[negi]]
+  %r1 = call i32 @llvm.nvvm.abs.i(i32 %a)
+
+; CHECK-DAG: [[negll:%[a-zA-Z0-9.]+]] = sub i64 0, %b
+; CHECK-DAG: [[cmpll:%[a-zA-Z0-9.]+]] = icmp sge i64 %b, 0
+; CHECK: select i1 [[cmpll]], i64 %b, i64 [[negll]]
+  %r2 = call i64 @llvm.nvvm.abs.ll(i64 %b)
+
+  ret void
+}
+
+; CHECK-LABEL: @min_max
+define void @min_max(i32 %a1, i32 %a2, i64 %b1, i64 %b2) {
+; CHECK: [[maxi:%[a-zA-Z0-9.]+]] = icmp sge i32 %a1, %a2
+; CHECK: select i1 [[maxi]], i32 %a1, i32 %a2
+  %r1 = call i32 @llvm.nvvm.max.i(i32 %a1, i32 %a2)
+
+; CHECK: [[maxll:%[a-zA-Z0-9.]+]] = icmp sge i64 %b1, %b2
+; CHECK: select i1 [[maxll]], i64 %b1, i64 %b2
+  %r2 = call i64 @llvm.nvvm.max.ll(i64 %b1, i64 %b2)
+
+; CHECK: [[maxui:%[a-zA-Z0-9.]+]] = icmp uge i32 %a1, %a2
+; CHECK: select i1 [[maxui]], i32 %a1, i32 %a2
+  %r3 = call i32 @llvm.nvvm.max.ui(i32 %a1, i32 %a2)
+
+; CHECK: [[maxull:%[a-zA-Z0-9.]+]] = icmp uge i64 %b1, %b2
+; CHECK: select i1 [[maxull]], i64 %b1, i64 %b2
+  %r4 = call i64 @llvm.nvvm.max.ull(i64 %b1, i64 %b2)
+
+; CHECK: [[mini:%[a-zA-Z0-9.]+]] = icmp sle i32 %a1, %a2
+; CHECK: select i1 [[mini]], i32 %a1, i32 %a2
+  %r5 = call i32 @llvm.nvvm.min.i(i32 %a1, i32 %a2)
+
+; CHECK: [[minll:%[a-zA-Z0-9.]+]] = icmp sle i64 %b1, %b2
+; CHECK: select i1 [[minll]], i64 %b1, i64 %b2
+  %r6 = call i64 @llvm.nvvm.min.ll(i64 %b1, i64 %b2)
+
+; CHECK: [[minui:%[a-zA-Z0-9.]+]] = icmp ule i32 %a1, %a2
+; CHECK: select i1 [[minui]], i32 %a1, i32 %a2
+  %r7 = call i32 @llvm.nvvm.min.ui(i32 %a1, i32 %a2)
+
+; CHECK: [[minull:%[a-zA-Z0-9.]+]] = icmp ule i64 %b1, %b2
+; CHECK: select i1 [[minull]], i64 %b1, i64 %b2
+  %r8 = call i64 @llvm.nvvm.min.ull(i64 %b1, i64 %b2)
+
+  ret void
+}