NOTE(review): This patch was reconstructed from a whitespace-collapsed copy in
which every "<...>" span starting with a letter or "!" had been stripped
(TableGen template parameter/argument lists, list<...> element types,
!cast<...>, Requires<!if(...)>).  One diff line per line has been restored and
the stripped spans re-inserted from the identifiers each body references; the
result matches the line counts in every hunk header.  Assumptions to confirm
against the original review before applying:
  * exact indentation / intra-line spacing (the copy had collapsed whitespace;
    apply with "patch -l" or regenerate the diff);
  * element types chosen for the restored list<...> fields;
  * the predicate list restored inside Requires<...> on the SHFL_INSTR defs
    (hasSHFL is defined by this patch for the non-sync forms; hasSM30 for both
    forms is presumed);
  * the parameter lists restored on the removed "multiclass SHFL"/"SHFL_SYNC"
    lines, their defm lines, and the "multiclass VOTE" context line.

Index: llvm/include/llvm/IR/IntrinsicsNVVM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -276,6 +276,26 @@
   );
 }
 
+class SHFL_INFO<bit sync, string mode, string type, bit return_pred> {
+  string Suffix = !if(sync, "sync_", "")
+                  # mode # "_"
+                  # type
+                  # !if(return_pred, "p", "");
+
+  string Name = "int_nvvm_shfl_" # Suffix;
+  string Builtin = "__nvvm_shfl_" # Suffix;
+  string IntrName = "llvm.nvvm.shfl." # !subst("_",".", Suffix);
+  list<int> withGccBuiltin = !if(return_pred, [], [1]);
+  list<int> withoutGccBuiltin = !if(return_pred, [1], []);
+  LLVMType OpType = !cond(
+    !eq(type,"i32"): llvm_i32_ty,
+    !eq(type,"f32"): llvm_float_ty);
+  list<LLVMType> RetTy = !if(return_pred, [OpType, llvm_i1_ty], [OpType]);
+  list<LLVMType> ArgsTy = !if(sync,
+    [llvm_i32_ty, OpType, llvm_i32_ty, llvm_i32_ty],
+    [OpType, llvm_i32_ty, llvm_i32_ty]);
+}
+
 let TargetPrefix = "nvvm" in {
   def int_nvvm_prmt : GCCBuiltin<"__nvvm_prmt">,
       Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -3955,90 +3975,27 @@
 //
 // SHUFFLE
 //
-
-// shfl.down.b32 dest, val, offset, mask_and_clamp
-def int_nvvm_shfl_down_i32 :
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.i32">,
-  GCCBuiltin<"__nvvm_shfl_down_i32">;
-def int_nvvm_shfl_down_f32 :
-  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.f32">,
-  GCCBuiltin<"__nvvm_shfl_down_f32">;
-
-// shfl.up.b32 dest, val, offset, mask_and_clamp
-def int_nvvm_shfl_up_i32 :
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.i32">,
-  GCCBuiltin<"__nvvm_shfl_up_i32">;
-def int_nvvm_shfl_up_f32 :
-  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.f32">,
-  GCCBuiltin<"__nvvm_shfl_up_f32">;
-
-// shfl.bfly.b32 dest, val, offset, mask_and_clamp
-def int_nvvm_shfl_bfly_i32 :
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">,
-  GCCBuiltin<"__nvvm_shfl_bfly_i32">;
-def int_nvvm_shfl_bfly_f32 :
-  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">,
-  GCCBuiltin<"__nvvm_shfl_bfly_f32">;
-
-// shfl.idx.b32 dest, val, lane, mask_and_clamp
-def int_nvvm_shfl_idx_i32 :
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.i32">,
-  GCCBuiltin<"__nvvm_shfl_idx_i32">;
-def int_nvvm_shfl_idx_f32 :
-  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
-  GCCBuiltin<"__nvvm_shfl_idx_f32">;
-
-// Synchronizing shfl variants available in CUDA-9.
-// On sm_70 these don't have to be convergent, so we may eventually want to
-// implement non-convergent variant of this intrinsic.
-
-// shfl.sync.down.b32 dest, threadmask, val, offset , mask_and_clamp
-def int_nvvm_shfl_sync_down_i32 :
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">,
-  GCCBuiltin<"__nvvm_shfl_sync_down_i32">;
-def int_nvvm_shfl_sync_down_f32 :
-  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">,
-  GCCBuiltin<"__nvvm_shfl_sync_down_f32">;
-
-// shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp
-def int_nvvm_shfl_sync_up_i32 :
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">,
-  GCCBuiltin<"__nvvm_shfl_sync_up_i32">;
-def int_nvvm_shfl_sync_up_f32 :
-  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">,
-  GCCBuiltin<"__nvvm_shfl_sync_up_f32">;
-
-// shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp
-def int_nvvm_shfl_sync_bfly_i32 :
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">,
-  GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">;
-def int_nvvm_shfl_sync_bfly_f32 :
-  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">,
-  GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">;
-
-// shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp
-def int_nvvm_shfl_sync_idx_i32 :
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">,
-  GCCBuiltin<"__nvvm_shfl_sync_idx_i32">;
-def int_nvvm_shfl_sync_idx_f32 :
-  Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
-            [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">,
-  GCCBuiltin<"__nvvm_shfl_sync_idx_f32">;
+// Generate intrinsics for all variants of shfl instruction.
+foreach sync = [0, 1] in {
+  foreach mode = ["up", "down", "bfly", "idx"] in {
+    foreach type = ["i32", "f32"] in {
+      foreach return_pred = [0, 1] in {
+        foreach i = [SHFL_INFO<sync, mode, type, return_pred>] in {
+          foreach _ = i.withGccBuiltin in {
+            def i.Name : GCCBuiltin<i.Builtin>,
+                         Intrinsic<i.RetTy, i.ArgsTy,
+                                   [IntrInaccessibleMemOnly, IntrConvergent],
+                                   i.IntrName>;
+          }
+          foreach _ = i.withoutGccBuiltin in {
+            def i.Name : Intrinsic<i.RetTy, i.ArgsTy,
+                         [IntrInaccessibleMemOnly, IntrConvergent], i.IntrName>;
+          }
+        }
+      }
+    }
+  }
+}
 
 //
 // VOTE
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -143,12 +143,17 @@
 def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
 def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
 def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
+def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
 
 def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
 def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
 def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
 def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
 
+// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
+def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
+                          "&& Subtarget->getPTXVersion() >= 64)">;
+
 def useShortPtr : Predicate<"useShortPointers()">;
 def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
 
Index: llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -56,6 +56,10 @@
                     []);
 }
 
+class THREADMASK_INFO<bit sync> {
+  list<bit> ret = !if(sync, [0,1], [0]);
+}
+
 //-----------------------------------
 // Synchronization and shuffle functions
 //-----------------------------------
@@ -129,121 +133,64 @@
              [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
         Requires<[hasPTX60, hasSM30]>;
 
-
-// shfl.{up,down,bfly,idx}.b32
-multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
-  // The last two parameters to shfl can be regs or imms. ptxas is smart
-  // enough to inline constant registers, so strictly speaking we don't need to
-  // handle immediates here. But it's easy enough, and it makes our ptx more
-  // readable.
-  def reg : NVPTXInst<
-      (outs regclass:$dst),
-      (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
-      !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
-      [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>;
-
-  def imm1 : NVPTXInst<
-      (outs regclass:$dst),
-      (ins regclass:$src, i32imm:$offset, Int32Regs:$mask),
-      !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
-      [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>;
-
-  def imm2 : NVPTXInst<
-      (outs regclass:$dst),
-      (ins regclass:$src, Int32Regs:$offset, i32imm:$mask),
-      !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
-      [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>;
-
-  def imm3 : NVPTXInst<
-      (outs regclass:$dst),
-      (ins regclass:$src, i32imm:$offset, i32imm:$mask),
-      !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
-      [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>;
+class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
+                 bit offset_imm, bit mask_imm, bit threadmask_imm>
+      : NVPTXInst<(outs), (ins), "?", []> {
+  NVPTXRegClass rc = !cond(
+    !eq(reg, "i32"): Int32Regs,
+    !eq(reg, "f32"): Float32Regs);
+  string IntrName = "int_nvvm_shfl_"
+                    # !if(sync, "sync_", "")
+                    # mode
+                    # "_" # reg
+                    # !if(return_pred, "p", "");
+  Intrinsic Intr = !cast<Intrinsic>(IntrName);
+  let InOperandList = !con(
+    !if(sync,
+        !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]),
+        (ins)),
+    (ins rc:$src),
+    !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]),
+    !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"])
+    );
+  let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst));
+  let AsmString = "shfl."
+     # !if(sync, "sync.", "")
+     # mode # ".b32\t"
+     # "$dst"
+     # !if(return_pred, "|$pred", "") # ", "
+     # "$src, $offset, $mask"
+     # !if(sync, ", $threadmask", "")
+     # ";"
+     ;
+  let Pattern = [!con(
+      !foreach(tmp, OutOperandList,
+             !subst(outs, set,
+             !subst(i32imm, imm, tmp))),
+      (set !foreach(tmp, InOperandList,
+                    !subst(ins, Intr,
+                    !subst(i32imm, imm, tmp))))
+  )];
 }
 
-defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>;
-defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>;
-defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>;
-defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>;
-defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>;
-defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>;
-defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
-defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
-
-multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
-  // Threadmask and the last two parameters to shfl.sync can be regs or imms.
-  // ptxas is smart enough to inline constant registers, so strictly speaking we
-  // don't need to handle immediates here. But it's easy enough, and it makes
-  // our ptx more readable.
-  def rrr : NVPTXInst<
-      (outs regclass:$dst),
-      (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
-      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
-      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
-                            Int32Regs:$offset, Int32Regs:$mask))]>;
-
-  def rri : NVPTXInst<
-      (outs regclass:$dst),
-      (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
-      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
-      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
-                            Int32Regs:$offset, imm:$mask))]>;
-
-  def rir : NVPTXInst<
-      (outs regclass:$dst),
-      (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
-      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
-      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
-                            imm:$offset, Int32Regs:$mask))]>;
-
-  def rii : NVPTXInst<
-      (outs regclass:$dst),
-      (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
-      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
-      [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
-                            imm:$offset, imm:$mask))]>;
-
-  def irr : NVPTXInst<
-      (outs regclass:$dst),
-      (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
-      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
-      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
-                            Int32Regs:$offset, Int32Regs:$mask))]>;
-
-  def iri : NVPTXInst<
-      (outs regclass:$dst),
-      (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
-      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
-      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
-                            Int32Regs:$offset, imm:$mask))]>;
-
-  def iir : NVPTXInst<
-      (outs regclass:$dst),
-      (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
-      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
-      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
-                            imm:$offset, Int32Regs:$mask))]>;
-
-  def iii : NVPTXInst<
-      (outs regclass:$dst),
-      (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
-      !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
-      [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
-                            imm:$offset, imm:$mask))]>;
+foreach sync = [0, 1] in {
+  foreach mode = ["up", "down", "bfly", "idx"] in {
+    foreach regclass = ["i32", "f32"] in {
+      foreach return_pred = [0, 1] in {
+        foreach offset_imm = [0, 1] in {
+          foreach mask_imm = [0, 1] in {
+            foreach threadmask_imm = THREADMASK_INFO<sync>.ret in {
+              def : SHFL_INSTR<sync, mode, regclass, return_pred,
+                               offset_imm, mask_imm, threadmask_imm>,
+                    Requires<!if(sync, [hasSM30], [hasSM30, hasSHFL])>;
+            }
+          }
+        }
+      }
+    }
+  }
 }
 
-// On sm_70 these don't have to be convergent, so we may eventually want to
-// implement non-convergent variant of this intrinsic.
-defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>;
-defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>;
-defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>;
-defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>;
-defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>;
-defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>;
-defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>;
-defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>;
-
-
 // vote.{all,any,uni,ballot}
 multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
   def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred),
Index: llvm/test/CodeGen/NVPTX/shfl-p.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/shfl-p.ll
@@ -0,0 +1,172 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
+
+declare {i32, i1} @llvm.nvvm.shfl.down.i32p(i32, i32, i32)
+declare {float, i1} @llvm.nvvm.shfl.down.f32p(float, i32, i32)
+declare {i32, i1} @llvm.nvvm.shfl.up.i32p(i32, i32, i32)
+declare {float, i1} @llvm.nvvm.shfl.up.f32p(float, i32, i32)
+declare {i32, i1} @llvm.nvvm.shfl.bfly.i32p(i32, i32, i32)
+declare {float, i1} @llvm.nvvm.shfl.bfly.f32p(float, i32, i32)
+declare {i32, i1} @llvm.nvvm.shfl.idx.i32p(i32, i32, i32)
+declare {float, i1} @llvm.nvvm.shfl.idx.f32p(float, i32, i32)
+
+; CHECK-LABEL: .func{{.*}}shfl.i32.rrr
+define {i32, i1} @shfl.i32.rrr(i32 %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 %c)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.i32.irr
+define {i32, i1} @shfl.i32.irr(i32 %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 %c)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.i32.rri
+define {i32, i1} @shfl.i32.rri(i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 1)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.i32.iri
+define {i32, i1} @shfl.i32.iri(i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 2)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.i32.rir
+define {i32, i1} @shfl.i32.rir(i32 %a, i32 %c) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 1, i32 %c)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.i32.iir
+define {i32, i1} @shfl.i32.iir(i32 %a, i32 %c) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 2, i32 %c)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.i32.rii
+define {i32, i1} @shfl.i32.rii(i32 %a) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 1, i32 2)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.i32.iii
+define {i32, i1} @shfl.i32.iii(i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 2, i32 3)
+  ret {i32, i1} %val
+}
+
+;; Same intrinsics, but for float
+
+; CHECK-LABEL: .func{{.*}}shfl.f32.rrr
+define {float, i1} @shfl.f32.rrr(float %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 %c)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.f32.irr
+define {float, i1} @shfl.f32.irr(float %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 %c)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.f32.rri
+define {float, i1} @shfl.f32.rri(float %a, i32 %b) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 1)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.f32.iri
+define {float, i1} @shfl.f32.iri(float %a, i32 %b) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 2)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.f32.rir
+define {float, i1} @shfl.f32.rir(float %a, i32 %c) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 1, i32 %c)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.f32.iir
+define {float, i1} @shfl.f32.iir(float %a, i32 %c) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 2, i32 %c)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.f32.rii
+define {float, i1} @shfl.f32.rii(float %a) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 1, i32 2)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.f32.iii
+define {float, i1} @shfl.f32.iii(float %a, i32 %b) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 2, i32 3)
+  ret {float, i1} %val
+}
Index: llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/shfl-sync-p.ll
@@ -0,0 +1,180 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
+
+declare {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32)
+declare {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32)
+declare {i32, i1} @llvm.nvvm.shfl.sync.up.i32p(i32, i32, i32, i32)
+declare {float, i1} @llvm.nvvm.shfl.sync.up.f32p(i32, float, i32, i32)
+declare {i32, i1} @llvm.nvvm.shfl.sync.bfly.i32p(i32, i32, i32, i32)
+declare {float, i1} @llvm.nvvm.shfl.sync.bfly.f32p(i32, float, i32, i32)
+declare {i32, i1} @llvm.nvvm.shfl.sync.idx.i32p(i32, i32, i32, i32)
+declare {float, i1} @llvm.nvvm.shfl.sync.idx.f32p(i32, float, i32, i32)
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rrr
+define {i32, i1} @shfl.sync.i32.rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 %c)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.i32.irr
+define {i32, i1} @shfl.sync.i32.irr(i32 %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 %c)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rri
+define {i32, i1} @shfl.sync.i32.rri(i32 %mask, i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 1)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.i32.iri
+define {i32, i1} @shfl.sync.i32.iri(i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 2)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rir
+define {i32, i1} @shfl.sync.i32.rir(i32 %mask, i32 %a, i32 %c) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 %c)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.i32.iir
+define {i32, i1} @shfl.sync.i32.iir(i32 %a, i32 %c) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 %c)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rii
+define {i32, i1} @shfl.sync.i32.rii(i32 %mask, i32 %a) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 2)
+  ret {i32, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.i32.iii
+define {i32, i1} @shfl.sync.i32.iii(i32 %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 3)
+  ret {i32, i1} %val
+}
+
+;; Same intrinsics, but for float
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rrr
+define {float, i1} @shfl.sync.f32.rrr(i32 %mask, float %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 %c)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.f32.irr
+define {float, i1} @shfl.sync.f32.irr(float %a, i32 %b, i32 %c) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 %c)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rri
+define {float, i1} @shfl.sync.f32.rri(i32 %mask, float %a, i32 %b) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 1)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.f32.iri
+define {float, i1} @shfl.sync.f32.iri(float %a, i32 %b) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 2)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rir
+define {float, i1} @shfl.sync.f32.rir(i32 %mask, float %a, i32 %c) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 %c)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.f32.iir
+define {float, i1} @shfl.sync.f32.iir(float %a, i32 %c) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 %c)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rii
+define {float, i1} @shfl.sync.f32.rii(i32 %mask, float %a) {
+  ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 2)
+  ret {float, i1} %val
+}
+
+; CHECK-LABEL: .func{{.*}}shfl.sync.f32.iii
+define {float, i1} @shfl.sync.f32.iii(float %a, i32 %b) {
+  ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
+  ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
+  ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
+  %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 3)
+  ret {float, i1} %val
+}