Index: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def =================================================================== --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def @@ -371,6 +371,9 @@ BUILTIN(__nvvm_bitcast_ll2d, "dLLi", "") BUILTIN(__nvvm_bitcast_d2ll, "LLid", "") +// FNS +TARGET_BUILTIN(__nvvm_fns, "UiUiUii", "n", "ptx60") + // Sync BUILTIN(__syncthreads, "v", "") Index: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h =================================================================== --- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h @@ -206,6 +206,10 @@ inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); } +inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) { + return __nvvm_fns(mask, base, offset); +} + #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 // Define __match* builtins CUDA-9 headers expect to see. Index: llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td +++ llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td @@ -682,6 +682,11 @@ def int_nvvm_bitcast_d2ll : GCCBuiltin<"__nvvm_bitcast_d2ll">, Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>; +// FNS + + def int_nvvm_fns : GCCBuiltin<"__nvvm_fns">, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; // Atomics not available as llvm intrinsics. def int_nvvm_atomic_load_add_f32 : Intrinsic<[llvm_float_ty], Index: llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td +++ llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -979,6 +979,33 @@ def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs, Float64Regs, int_nvvm_bitcast_d2ll>; +// +// FNS +// + +class INT_FNS_MBO + : NVPTXInst<(outs Int32Regs:$dst), ins, + "fns.b32 \t$dst, $mask, $base, $offset;", + [(set Int32Regs:$dst, Operands )]>, + Requires<[hasPTX60, hasSM30]>; + +def INT_FNS_rrr : INT_FNS_MBO<(ins Int32Regs:$mask, Int32Regs:$base, Int32Regs:$offset), + (int_nvvm_fns Int32Regs:$mask, Int32Regs:$base, Int32Regs:$offset)>; +def INT_FNS_rri : INT_FNS_MBO<(ins Int32Regs:$mask, Int32Regs:$base, i32imm:$offset), + (int_nvvm_fns Int32Regs:$mask, Int32Regs:$base, imm:$offset)>; +def INT_FNS_rir : INT_FNS_MBO<(ins Int32Regs:$mask, i32imm:$base, Int32Regs:$offset), + (int_nvvm_fns Int32Regs:$mask, imm:$base, Int32Regs:$offset)>; +def INT_FNS_rii : INT_FNS_MBO<(ins Int32Regs:$mask, i32imm:$base, i32imm:$offset), + (int_nvvm_fns Int32Regs:$mask, imm:$base, imm:$offset)>; +def INT_FNS_irr : INT_FNS_MBO<(ins i32imm:$mask, Int32Regs:$base, Int32Regs:$offset), + (int_nvvm_fns imm:$mask, Int32Regs:$base, Int32Regs:$offset)>; +def INT_FNS_iri : INT_FNS_MBO<(ins i32imm:$mask, Int32Regs:$base, i32imm:$offset), + (int_nvvm_fns imm:$mask, Int32Regs:$base, imm:$offset)>; +def INT_FNS_iir : INT_FNS_MBO<(ins i32imm:$mask, i32imm:$base, Int32Regs:$offset), + (int_nvvm_fns imm:$mask, imm:$base, Int32Regs:$offset)>; +def INT_FNS_iii : INT_FNS_MBO<(ins i32imm:$mask, i32imm:$base, i32imm:$offset), + (int_nvvm_fns imm:$mask, imm:$base, imm:$offset)>; + //----------------------------------- // Atomic Functions //----------------------------------- Index: llvm/trunk/test/CodeGen/NVPTX/fns.ll =================================================================== --- llvm/trunk/test/CodeGen/NVPTX/fns.ll +++ llvm/trunk/test/CodeGen/NVPTX/fns.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s + +declare i32 @llvm.nvvm.fns(i32, i32, i32) + +; CHECK-LABEL: .func{{.*}}fns +define i32 @fns(i32 %mask, i32 %base, i32 %offset) { + ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [fns_param_0]; + ; CHECK: ld.param.u32 [[BASE:%r[0-9]+]], [fns_param_1]; + ; CHECK: ld.param.u32 [[OFFSET:%r[0-9]+]], [fns_param_2]; + + ; CHECK: fns.b32 {{%r[0-9]+}}, [[MASK]], [[BASE]], [[OFFSET]]; + %r0 = call i32 @llvm.nvvm.fns(i32 %mask, i32 %base, i32 %offset); + ; CHECK: fns.b32 {{%r[0-9]+}}, [[MASK]], [[BASE]], 0; + %r1 = call i32 @llvm.nvvm.fns(i32 %mask, i32 %base, i32 0); + %r01 = add i32 %r0, %r1; + ; CHECK: fns.b32 {{%r[0-9]+}}, [[MASK]], 1, [[OFFSET]]; + %r2 = call i32 @llvm.nvvm.fns(i32 %mask, i32 1, i32 %offset); + ; CHECK: fns.b32 {{%r[0-9]+}}, [[MASK]], 1, 0; + %r3 = call i32 @llvm.nvvm.fns(i32 %mask, i32 1, i32 0); + %r23 = add i32 %r2, %r3; + %r0123 = add i32 %r01, %r23; + ; CHECK: fns.b32 {{%r[0-9]+}}, 2, [[BASE]], [[OFFSET]]; + %r4 = call i32 @llvm.nvvm.fns(i32 2, i32 %base, i32 %offset); + ; CHECK: fns.b32 {{%r[0-9]+}}, 2, [[BASE]], 0; + %r5 = call i32 @llvm.nvvm.fns(i32 2, i32 %base, i32 0); + %r45 = add i32 %r4, %r5; + ; CHECK: fns.b32 {{%r[0-9]+}}, 2, 1, [[OFFSET]]; + %r6 = call i32 @llvm.nvvm.fns(i32 2, i32 1, i32 %offset); + ; CHECK: fns.b32 {{%r[0-9]+}}, 2, 1, 0; + %r7 = call i32 @llvm.nvvm.fns(i32 2, i32 1, i32 0); + %r67 = add i32 %r6, %r7; + %r4567 = add i32 %r45, %r67; + %r = add i32 %r0123, %r4567; + ret i32 %r; +} +