Add NVPTX support for the redux.sync warp-reduction instructions.

This patch adds the __nvvm_redux_sync_* Clang builtins, the matching
llvm.nvvm.redux.sync.* intrinsics, the hasPTX70/hasSM80 subtarget predicates,
the instruction-selection patterns that use them, and front-end/back-end tests.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy; confirm context-line whitespace against the base
revision (or apply with whitespace-insensitive matching).

Index: clang/include/clang/Basic/BuiltinsNVPTX.def
===================================================================
--- clang/include/clang/Basic/BuiltinsNVPTX.def
+++ clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -456,6 +456,17 @@
 TARGET_BUILTIN(__nvvm_match_all_sync_i32p, "UiUiUii*", "", PTX60)
 TARGET_BUILTIN(__nvvm_match_all_sync_i64p, "WiUiWii*", "", PTX60)
 
+// Redux
+TARGET_BUILTIN(__nvvm_redux_sync_add_s32, "SiSii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_min_s32, "SiSii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_max_s32, "SiSii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_add_u32, "UiUii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_min_u32, "UiUii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_max_u32, "UiUii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_and_b32, "iii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_xor_b32, "iii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_or_b32, "iii", "", AND(SM_80,PTX70))
+
 // Membar
 
 BUILTIN(__nvvm_membar_cta, "v", "")
Index: clang/test/CodeGenCUDA/redux-builtins.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/redux-builtins.cu
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx70" "-target-cpu" "sm_80" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx70" "-target-cpu" "sm_80" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+// CHECK: define{{.*}} void @_Z6kernelPi(i32* %out)
+__attribute__((global)) void kernel(int *out) {
+  int a = 1;
+  unsigned int b = 5;
+  int i = 0;
+
+  out[i++] = __nvvm_redux_sync_add_s32(a, 0xFF);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.add.s32
+
+  out[i++] = __nvvm_redux_sync_min_s32(a, 0x0F);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.min.s32
+
+  out[i++] = __nvvm_redux_sync_max_s32(a, 0xF0);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.max.s32
+
+  out[i++] = __nvvm_redux_sync_add_u32(b, 0x01);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.add.u32
+
+  out[i++] = __nvvm_redux_sync_min_u32(b, 0xF0);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.min.u32
+
+  out[i++] = __nvvm_redux_sync_max_u32(b, 0x0F);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.max.u32
+
+  out[i++] = __nvvm_redux_sync_and_b32(a, 0xF0);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.and.b32
+
+  out[i++] = __nvvm_redux_sync_and_b32(b, 0x0F);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.and.b32
+
+  out[i++] = __nvvm_redux_sync_xor_b32(a, 0x10);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.xor.b32
+
+  out[i++] = __nvvm_redux_sync_xor_b32(b, 0x01);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.xor.b32
+
+  out[i++] = __nvvm_redux_sync_or_b32(a, 0xFF);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.or.b32
+
+  out[i++] = __nvvm_redux_sync_or_b32(b, 0xFF);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.or.b32
+
+  // CHECK: ret void
+}
Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4096,6 +4096,54 @@
   Intrinsic<[llvm_i64_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
             [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i64p">;
 
+//
+// REDUX.SYNC
+//
+// redux.sync.add.u32 dst, src, membermask;
+def int_nvvm_redux_sync_add_u32 : GCCBuiltin<"__nvvm_redux_sync_add_u32">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem]>;
+
+// redux.sync.min.u32 dst, src, membermask;
+def int_nvvm_redux_sync_min_u32 : GCCBuiltin<"__nvvm_redux_sync_min_u32">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem]>;
+
+// redux.sync.max.u32 dst, src, membermask;
+def int_nvvm_redux_sync_max_u32 : GCCBuiltin<"__nvvm_redux_sync_max_u32">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem]>;
+
+// redux.sync.add.s32 dst, src, membermask;
+def int_nvvm_redux_sync_add_s32 : GCCBuiltin<"__nvvm_redux_sync_add_s32">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem]>;
+
+// redux.sync.min.s32 dst, src, membermask;
+def int_nvvm_redux_sync_min_s32 : GCCBuiltin<"__nvvm_redux_sync_min_s32">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem]>;
+
+// redux.sync.max.s32 dst, src, membermask;
+def int_nvvm_redux_sync_max_s32 : GCCBuiltin<"__nvvm_redux_sync_max_s32">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem]>;
+
+// redux.sync.and.b32 dst, src, membermask;
+def int_nvvm_redux_sync_and_b32 : GCCBuiltin<"__nvvm_redux_sync_and_b32">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem]>;
+
+// redux.sync.xor.b32 dst, src, membermask;
+def int_nvvm_redux_sync_xor_b32 : GCCBuiltin<"__nvvm_redux_sync_xor_b32">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem]>;
+
+// redux.sync.or.b32 dst, src, membermask;
+def int_nvvm_redux_sync_or_b32 : GCCBuiltin<"__nvvm_redux_sync_or_b32">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrNoMem]>;
+
 //
 // WMMA instructions
 //
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -144,11 +144,13 @@
 def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
 def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
 def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
+def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
 
 def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
 def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
 def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
 def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
+def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;
 
 // non-sync shfl instructions are not available on sm_70+ in PTX6.4+
 def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -274,6 +274,23 @@
 defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_sync_i64p,
                                          i64imm>;
 
+multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> {
+  def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask),
+          "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;",
+          [(set Int32Regs:$dst, (Intrin Int32Regs:$src, Int32Regs:$mask))]>,
+        Requires<[hasPTX70, hasSM80]>;
+}
+
+defm REDUX_SYNC_ADD_U32 : REDUX_SYNC<"add", "u32", int_nvvm_redux_sync_add_u32>;
+defm REDUX_SYNC_MIN_U32 : REDUX_SYNC<"min", "u32", int_nvvm_redux_sync_min_u32>;
+defm REDUX_SYNC_MAX_U32 : REDUX_SYNC<"max", "u32", int_nvvm_redux_sync_max_u32>;
+defm REDUX_SYNC_ADD_S32 : REDUX_SYNC<"add", "s32", int_nvvm_redux_sync_add_s32>;
+defm REDUX_SYNC_MIN_S32 : REDUX_SYNC<"min", "s32", int_nvvm_redux_sync_min_s32>;
+defm REDUX_SYNC_MAX_S32 : REDUX_SYNC<"max", "s32", int_nvvm_redux_sync_max_s32>;
+defm REDUX_SYNC_AND_B32 : REDUX_SYNC<"and", "b32", int_nvvm_redux_sync_and_b32>;
+defm REDUX_SYNC_XOR_B32 : REDUX_SYNC<"xor", "b32", int_nvvm_redux_sync_xor_b32>;
+defm REDUX_SYNC_OR_B32 : REDUX_SYNC<"or", "b32", int_nvvm_redux_sync_or_b32>;
+
 } // isConvergent = true
 
 //-----------------------------------
Index: llvm/test/CodeGen/NVPTX/redux-sync.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/redux-sync.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+
+declare i32 @llvm.nvvm.redux.sync.add.u32(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_add_u32
+define i32 @redux_sync_add_u32(i32 %src, i32 %mask) {
+  ; CHECK: redux.sync.add.u32
+  %val = call i32 @llvm.nvvm.redux.sync.add.u32(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.min.u32(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_min_u32
+define i32 @redux_sync_min_u32(i32 %src, i32 %mask) {
+  ; CHECK: redux.sync.min.u32
+  %val = call i32 @llvm.nvvm.redux.sync.min.u32(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.max.u32(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_max_u32
+define i32 @redux_sync_max_u32(i32 %src, i32 %mask) {
+  ; CHECK: redux.sync.max.u32
+  %val = call i32 @llvm.nvvm.redux.sync.max.u32(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.add.s32(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_add_s32
+define i32 @redux_sync_add_s32(i32 %src, i32 %mask) {
+  ; CHECK: redux.sync.add.s32
+  %val = call i32 @llvm.nvvm.redux.sync.add.s32(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.min.s32(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_min_s32
+define i32 @redux_sync_min_s32(i32 %src, i32 %mask) {
+  ; CHECK: redux.sync.min.s32
+  %val = call i32 @llvm.nvvm.redux.sync.min.s32(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.max.s32(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_max_s32
+define i32 @redux_sync_max_s32(i32 %src, i32 %mask) {
+  ; CHECK: redux.sync.max.s32
+  %val = call i32 @llvm.nvvm.redux.sync.max.s32(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.and.b32(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_and_b32
+define i32 @redux_sync_and_b32(i32 %src, i32 %mask) {
+  ; CHECK: redux.sync.and.b32
+  %val = call i32 @llvm.nvvm.redux.sync.and.b32(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.xor.b32(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_xor_b32
+define i32 @redux_sync_xor_b32(i32 %src, i32 %mask) {
+  ; CHECK: redux.sync.xor.b32
+  %val = call i32 @llvm.nvvm.redux.sync.xor.b32(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.or.b32(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_or_b32
+define i32 @redux_sync_or_b32(i32 %src, i32 %mask) {
+  ; CHECK: redux.sync.or.b32
+  %val = call i32 @llvm.nvvm.redux.sync.or.b32(i32 %src, i32 %mask)
+  ret i32 %val
+}