diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -214,6 +214,8 @@
 BUILTIN(__builtin_amdgcn_read_exec_lo, "Ui", "nc")
 BUILTIN(__builtin_amdgcn_read_exec_hi, "Ui", "nc")
 
+BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
+
 //===----------------------------------------------------------------------===//
 // R600-NI only builtins.
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
--- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu
+++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu
@@ -16,3 +16,9 @@
   __shared__ float shared;
   volatile float x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 0, false);
 }
+
+// CHECK-LABEL: @_Z6endpgmv(
+// CHECK: call void @llvm.amdgcn.endpgm()
+__global__ void endpgm() {
+  __builtin_amdgcn_endpgm();
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1577,6 +1577,10 @@
 // FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn?
 def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;
 
+def int_amdgcn_endpgm : GCCBuiltin<"__builtin_amdgcn_endpgm">,
+  Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects]
+>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1118,6 +1118,7 @@
 def S_ENDPGM : SOPP_Pseudo<"s_endpgm", (ins EndpgmImm:$simm16), "$simm16"> {
   let isBarrier = 1;
   let isReturn = 1;
+  let hasSideEffects = 1;
 }
 
 def S_ENDPGM_SAVED : SOPP_Pseudo<"s_endpgm_saved", (ins)> {
@@ -1328,6 +1329,11 @@
   (S_ENDPGM (i16 0))
 >;
 
+def : GCNPat <
+  (int_amdgcn_endpgm),
+  (S_ENDPGM (i16 0))
+>;
+
 def : GCNPat <
   (i64 (ctpop i64:$src)),
   (i64 (REG_SEQUENCE SReg_64,
diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_kernel void @test0() {
+; CHECK-LABEL: test0:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_endpgm
+  tail call void @llvm.amdgcn.endpgm()
+  unreachable
+}
+
+define void @test1() {
+; CHECK-LABEL: test1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_endpgm
+  tail call void @llvm.amdgcn.endpgm()
+  unreachable
+}
+
+define amdgpu_kernel void @test2(i32* %p, i32 %x) {
+; CHECK-LABEL: test2:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_cmp_lt_i32 s2, 1
+; CHECK-NEXT:    s_cbranch_scc0 BB2_2
+; CHECK-NEXT:  ; %bb.1: ; %else
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-NEXT:    s_endpgm
+; CHECK-NEXT:  BB2_2: ; %then
+; CHECK-NEXT:    s_endpgm
+  %cond = icmp sgt i32 %x, 0
+  br i1 %cond, label %then, label %else
+
+then:
+  tail call void @llvm.amdgcn.endpgm()
+  unreachable
+
+else:
+  store i32 %x, i32* %p
+  ret void
+}
+
+declare void @llvm.amdgcn.endpgm()
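
For context, the sketch below shows how the new builtin might be used from HIP/CUDA device code once this patch lands. The kernel name and the abort condition are illustrative, not part of the patch; the only facts taken from the change itself are that __builtin_amdgcn_endpgm() is noreturn and selects to s_endpgm, i.e. it terminates the executing wavefront immediately.

// Hypothetical usage sketch (not part of the patch): abort a kernel early on a
// uniform error flag. Because __builtin_amdgcn_endpgm() ends the whole
// wavefront, it should only be reached under a condition that is uniform
// across the wave.
__global__ void checked_kernel(const int *err_flag, const int *in, int *out) {
  if (*err_flag != 0) {        // uniform read: every lane sees the same value
    __builtin_amdgcn_endpgm(); // lowers to s_endpgm; control never returns
  }
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  out[idx] = in[idx];
}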