This is an archive of the discontinued LLVM Phabricator instance.

Differential D21160

[NVPTX] Add intrinsics for shfl instructions.
ClosedPublic

Authored by jlebar on Jun 8 2016, 5:21 PM.

Download Raw Diff

Details

Reviewers

tra

Commits

rGed2c282d4b1d: [NVPTX] Add intrinsics for shfl instructions.
rL272298: [NVPTX] Add intrinsics for shfl instructions.

Summary

Currently clang emits these instructions via inline (volatile) asm in
the CUDA headers. Switching to intrinsics will let the optimizer reason
across calls to these intrinsics.

Diff Detail

Repository: rL LLVM

Event Timeline

jlebar updated this revision to Diff 60123. Jun 8 2016, 5:21 PM

jlebar retitled this revision to "[NVPTX] Add intrinsics for shfl instructions."

jlebar updated this object.

jlebar added a reviewer: tra.

jlebar mentioned this in D21162: [CUDA] Implement __shfl* intrinsics in clang headers..

jlebar added subscribers: jholewinski, llvm-commits.

Looks good to me!

LGTM

test/CodeGen/NVPTX/shfl.ll
19 ↗	(On Diff #60123)	I'm curious why {{.}}32 here? Do you expect return type to change?

This revision is now accepted and ready to land. Jun 9 2016, 11:02 AM

jlebar marked an inline comment as done. Jun 9 2016, 12:50 PM

jlebar added inline comments.

test/CodeGen/NVPTX/shfl.ll
19 ↗	(On Diff #60123)	It's currently a b32, but there's no reason (afaict) that it couldn't be a u32 (or i32). I didn't want to tie this test to the current behavior, since I don't think it matters.

Closed by commit rL272298: [NVPTX] Add intrinsics for shfl instructions. (authored by jlebar). · Explain Why · Jun 9 2016, 1:10 PM

This revision was automatically updated to reflect the committed changes.

jlebar marked an inline comment as done.

jlebar mentioned this in rL272299: [CUDA] Implement __shfl* intrinsics in clang headers..

Revision Contents

Path

Size

llvm/

trunk/

include/

llvm/

IR/

IntrinsicsNVVM.td

44 lines

lib/

Target/

NVPTX/

NVPTXIntrinsics.td

43 lines

test/

CodeGen/

NVPTX/

shfl.ll

90 lines

Diff 60229

llvm/trunk/include/llvm/IR/IntrinsicsNVVM.td

Show First 20 Lines • Show All 3,740 Lines • ▼ Show 20 Lines	def int_ptx_read_pm1 : PTXReadSpecialRegisterIntrinsic_r32
<"__builtin_ptx_read_pm1">;		<"__builtin_ptx_read_pm1">;
def int_ptx_read_pm2 : PTXReadSpecialRegisterIntrinsic_r32		def int_ptx_read_pm2 : PTXReadSpecialRegisterIntrinsic_r32
<"__builtin_ptx_read_pm2">;		<"__builtin_ptx_read_pm2">;
def int_ptx_read_pm3 : PTXReadSpecialRegisterIntrinsic_r32		def int_ptx_read_pm3 : PTXReadSpecialRegisterIntrinsic_r32
<"__builtin_ptx_read_pm3">;		<"__builtin_ptx_read_pm3">;

def int_ptx_bar_sync : Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>,		def int_ptx_bar_sync : Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>,
GCCBuiltin<"__builtin_ptx_bar_sync">;		GCCBuiltin<"__builtin_ptx_bar_sync">;

		//
		// SHUFFLE
		//

		// shfl.down.b32 dest, val, offset, mask_and_clamp
		//
		// Two flavours are declared, differing only in the type shared by `val`
		// and the result (i32 or float); `offset` and `mask_and_clamp` are always
		// i32. Both carry IntrNoMem and IntrConvergent. The IR-level name is given
		// explicitly as llvm.nvvm.shfl.down.*, even though the defs themselves are
		// named int_ptx_*.

		// i32 value in, i32 value out.
		def int_ptx_shfl_down_i32 :
		Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.down.i32">,
		GCCBuiltin<"__builtin_ptx_shfl_down_i32">;
		// float value in, float value out.
		def int_ptx_shfl_down_f32 :
		Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.down.f32">,
		GCCBuiltin<"__builtin_ptx_shfl_down_f32">;

		// shfl.up.b32 dest, val, offset, mask_and_clamp
		//
		// Two flavours are declared, differing only in the type shared by `val`
		// and the result (i32 or float); `offset` and `mask_and_clamp` are always
		// i32. Both carry IntrNoMem and IntrConvergent. The IR-level name is given
		// explicitly as llvm.nvvm.shfl.up.*, even though the defs themselves are
		// named int_ptx_*.

		// i32 value in, i32 value out.
		def int_ptx_shfl_up_i32 :
		Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.up.i32">,
		GCCBuiltin<"__builtin_ptx_shfl_up_i32">;
		// float value in, float value out.
		def int_ptx_shfl_up_f32 :
		Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.up.f32">,
		GCCBuiltin<"__builtin_ptx_shfl_up_f32">;

		// shfl.bfly.b32 dest, val, offset, mask_and_clamp
		//
		// Two flavours are declared, differing only in the type shared by `val`
		// and the result (i32 or float); `offset` and `mask_and_clamp` are always
		// i32. Both carry IntrNoMem and IntrConvergent. The IR-level name is given
		// explicitly as llvm.nvvm.shfl.bfly.*, even though the defs themselves are
		// named int_ptx_*.

		// i32 value in, i32 value out.
		def int_ptx_shfl_bfly_i32 :
		Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">,
		GCCBuiltin<"__builtin_ptx_shfl_bfly_i32">;
		// float value in, float value out.
		def int_ptx_shfl_bfly_f32 :
		Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">,
		GCCBuiltin<"__builtin_ptx_shfl_bfly_f32">;

		// shfl.idx.b32 dest, val, lane, mask_and_clamp
		//
		// Two flavours are declared, differing only in the type shared by `val`
		// and the result (i32 or float); `lane` and `mask_and_clamp` are always
		// i32. Both carry IntrNoMem and IntrConvergent. The IR-level name is given
		// explicitly as llvm.nvvm.shfl.idx.*, even though the defs themselves are
		// named int_ptx_*.

		// i32 value in, i32 value out.
		def int_ptx_shfl_idx_i32 :
		Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.i32">,
		GCCBuiltin<"__builtin_ptx_shfl_idx_i32">;
		// float value in, float value out.
		def int_ptx_shfl_idx_f32 :
		Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
		GCCBuiltin<"__builtin_ptx_shfl_idx_f32">;

llvm/trunk/lib/Target/NVPTX/NVPTXIntrinsics.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show All 24 Lines
def immDouble1 : PatLeaf<(fpimm), [{		def immDouble1 : PatLeaf<(fpimm), [{
double d = (double)N->getValueAPF().convertToDouble();		double d = (double)N->getValueAPF().convertToDouble();
return (d==1.0);		return (d==1.0);
}]>;		}]>;



//-----------------------------------		//-----------------------------------
// Synchronization Functions		// Synchronization and shuffle functions
//-----------------------------------		//-----------------------------------
let isConvergent = 1 in {		let isConvergent = 1 in {
def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins),		def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins),
"bar.sync \t0;",		"bar.sync \t0;",
[(int_cuda_syncthreads)]>;		[(int_cuda_syncthreads)]>;
def INT_BARRIER0 : NVPTXInst<(outs), (ins),		def INT_BARRIER0 : NVPTXInst<(outs), (ins),
"bar.sync \t0;",		"bar.sync \t0;",
[(int_nvvm_barrier0)]>;		[(int_nvvm_barrier0)]>;
Show All 17 Lines	def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("{{ \n\t",		!strconcat("{{ \n\t",
!strconcat(".reg .pred \t%p1; \n\t",		!strconcat(".reg .pred \t%p1; \n\t",
!strconcat(".reg .pred \t%p2; \n\t",		!strconcat(".reg .pred \t%p2; \n\t",
!strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",		!strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
!strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t",		!strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t",
!strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",		!strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
!strconcat("}}", ""))))))),		!strconcat("}}", ""))))))),
[(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;		[(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;

		// shfl.{up,down,bfly,idx}.b32
		//
		// Expands to four NVPTXInst records per instantiation (suffixes reg, imm1,
		// imm2 and imm3), one for each register/immediate combination of the
		// $offset and $mask operands. All four print the same
		// "shfl.<mode>.b32 $dst, $src, $offset, $mask;" string.
		//
		//   regclass - register class used for both $src and $dst.
		//   mode     - text spliced into the mnemonic between "shfl." and ".b32".
		//   IntOp    - intrinsic matched by each record's selection pattern.
		multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
		// The last two parameters to shfl can be regs or imms. ptxas is smart
		// enough to inline constant registers, so strictly speaking we don't need to
		// handle immediates here. But it's easy enough, and it makes our ptx more
		// readable.

		// $offset and $mask both in Int32Regs.
		def reg : NVPTXInst<
		(outs regclass:$dst),
		(ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
		!strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
		[(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>;

		// $offset as an immediate, $mask in a register.
		def imm1 : NVPTXInst<
		(outs regclass:$dst),
		(ins regclass:$src, i32imm:$offset, Int32Regs:$mask),
		!strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
		[(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>;

		// $offset in a register, $mask as an immediate.
		def imm2 : NVPTXInst<
		(outs regclass:$dst),
		(ins regclass:$src, Int32Regs:$offset, i32imm:$mask),
		!strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
		[(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>;

		// $offset and $mask both as immediates.
		def imm3 : NVPTXInst<
		(outs regclass:$dst),
		(ins regclass:$src, i32imm:$offset, i32imm:$mask),
		!strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
		[(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>;
		}

		// Instantiate SHFL once per (mode, value type) pair: each of down, up, bfly
		// and idx gets an Int32Regs and a Float32Regs version, bound to the
		// matching int_ptx_shfl_* intrinsic. Every defm yields the multiclass's
		// reg/imm1/imm2/imm3 records under the given name prefix.
		defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_ptx_shfl_down_i32>;
		defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_ptx_shfl_down_f32>;
		defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_ptx_shfl_up_i32>;
		defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_ptx_shfl_up_f32>;
		defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_ptx_shfl_bfly_i32>;
		defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_ptx_shfl_bfly_f32>;
		defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_ptx_shfl_idx_i32>;
		defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_ptx_shfl_idx_f32>;

} // isConvergent = 1		} // isConvergent = 1


//-----------------------------------		//-----------------------------------
// Explicit Memory Fence Functions		// Explicit Memory Fence Functions
//-----------------------------------		//-----------------------------------
class MEMBAR<string StrOp, Intrinsic IntOP> :		class MEMBAR<string StrOp, Intrinsic IntOP> :
NVPTXInst<(outs), (ins),		NVPTXInst<(outs), (ins),
▲ Show 20 Lines • Show All 6,990 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/NVPTX/shfl.ll

				; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -disable-nvptx-favor-non-generic | FileCheck %s
				; Exercises lowering of the llvm.nvvm.shfl.* intrinsics to PTX shfl
				; instructions.

				; Intrinsics under test: an i32 and a float flavour for each of the four
				; shuffle modes (down, up, bfly, idx). Every one takes the value to
				; shuffle followed by two i32 operands and returns the value's type.
				declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32)
				declare float @llvm.nvvm.shfl.down.f32(float, i32, i32)
				declare i32 @llvm.nvvm.shfl.up.i32(i32, i32, i32)
				declare float @llvm.nvvm.shfl.up.f32(float, i32, i32)
				declare i32 @llvm.nvvm.shfl.bfly.i32(i32, i32, i32)
				declare float @llvm.nvvm.shfl.bfly.f32(float, i32, i32)
				declare i32 @llvm.nvvm.shfl.idx.i32(i32, i32, i32)
				declare float @llvm.nvvm.shfl.idx.f32(float, i32, i32)

				; Try all four permutations of register and immediate parameters with
				; shfl.down.

				; Second and third operands are both immediates (1 and 2); the shuffled
				; value must be what gets written to the return slot.
				; CHECK-LABEL: .func{{.*}}shfl.down1
				define i32 @shfl.down1(i32 %src) {
				; CHECK: ld.param.u32 [[IN:%r[0-9]+]]

				; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]], [[IN]], 1, 2;
				%shuffled = call i32 @llvm.nvvm.shfl.down.i32(i32 %src, i32 1, i32 2)

				; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
				ret i32 %shuffled
				}

				; Second operand comes from a register, third operand is the immediate 3.
				; CHECK-LABEL: .func{{.*}}shfl.down2
				define i32 @shfl.down2(i32 %src, i32 %offset) {
				; CHECK: ld.param.u32 [[IN1:%r[0-9]+]]
				; CHECK: ld.param.u32 [[IN2:%r[0-9]+]]

				; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], [[IN2]], 3;
				%shuffled = call i32 @llvm.nvvm.shfl.down.i32(i32 %src, i32 %offset, i32 3)
				ret i32 %shuffled
				}

				; Second operand is the immediate 4, third operand comes from a register.
				; CHECK-LABEL: .func{{.*}}shfl.down3
				define i32 @shfl.down3(i32 %src, i32 %mask) {
				; CHECK: ld.param.u32 [[IN1:%r[0-9]+]]
				; CHECK: ld.param.u32 [[IN2:%r[0-9]+]]

				; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], 4, [[IN2]];
				%shuffled = call i32 @llvm.nvvm.shfl.down.i32(i32 %src, i32 4, i32 %mask)
				ret i32 %shuffled
				}

				; Second and third operands both come from registers.
				; CHECK-LABEL: .func{{.*}}shfl.down4
				define i32 @shfl.down4(i32 %src, i32 %offset, i32 %mask) {
				; CHECK: ld.param.u32 [[IN1:%r[0-9]+]]
				; CHECK: ld.param.u32 [[IN2:%r[0-9]+]]
				; CHECK: ld.param.u32 [[IN3:%r[0-9]+]]

				; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], [[IN2]], [[IN3]];
				%shuffled = call i32 @llvm.nvvm.shfl.down.i32(i32 %src, i32 %offset, i32 %mask)
				ret i32 %shuffled
				}

				; Float flavour of shfl.down: the value travels through %f registers while
				; the two trailing operands stay as the immediates 5 and 6.
				; CHECK-LABEL: .func{{.*}}shfl.down.float
				define float @shfl.down.float(float %src) {
				; CHECK: ld.param.f32 [[IN:%f[0-9]+]]

				; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]], [[IN]], 5, 6;
				%shuffled = call float @llvm.nvvm.shfl.down.f32(float %src, i32 5, i32 6)

				; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
				ret float %shuffled
				}

				; Try the rest of the shfl modes. Hopefully they're declared in such a way
				; that if shfl.down works correctly, they also work correctly.
				;
				; Each mode is exercised once with an i32 value and once with a float
				; value, always with immediate trailing operands; results are stored
				; through the pointer arguments so the calls have a use.
				; NOTE(review): every i32 result goes to the same %out_i32 (and every
				; float result to %out_float), so later stores overwrite earlier ones.
				; Confirm nothing in the llc pipeline drops the earlier stores and, with
				; them, the shuffles being verified.
				; CHECK-LABEL: .func{{.*}}shfl.rest
				define void @shfl.rest(i32 %in_i32, float %in_float, i32* %out_i32, float* %out_float) {
				; CHECK: shfl.up.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 1, 2;
				%up_i32 = call i32 @llvm.nvvm.shfl.up.i32(i32 %in_i32, i32 1, i32 2)
				store i32 %up_i32, i32* %out_i32

				; CHECK: shfl.up.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 3, 4;
				%up_float = call float @llvm.nvvm.shfl.up.f32(float %in_float, i32 3, i32 4)
				store float %up_float, float* %out_float

				; CHECK: shfl.bfly.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 5, 6;
				%bfly_i32 = call i32 @llvm.nvvm.shfl.bfly.i32(i32 %in_i32, i32 5, i32 6)
				store i32 %bfly_i32, i32* %out_i32

				; CHECK: shfl.bfly.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 7, 8;
				%bfly_float = call float @llvm.nvvm.shfl.bfly.f32(float %in_float, i32 7, i32 8)
				store float %bfly_float, float* %out_float

				; CHECK: shfl.idx.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 9, 10;
				%idx_i32 = call i32 @llvm.nvvm.shfl.idx.i32(i32 %in_i32, i32 9, i32 10)
				store i32 %idx_i32, i32* %out_i32

				; CHECK: shfl.idx.b32 %f{{[0-9]+}}, %f{{[0-9]+}}, 11, 12;
				%idx_float = call float @llvm.nvvm.shfl.idx.f32(float %in_float, i32 11, i32 12)
				store float %idx_float, float* %out_float

				ret void
				}