diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1886,6 +1886,15 @@
              LLVMMatchType<1>, llvm_v4i32_ty],
             [IntrReadMem, IntrWillReturn]>;
 
+//===----------------------------------------------------------------------===//
+// GFX11 Intrinsics
+//===----------------------------------------------------------------------===//
+
+// llvm.amdgcn.permlane64
+def int_amdgcn_permlane64 :
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn]>;
+
 //===----------------------------------------------------------------------===//
 // Deep learning intrinsics.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -311,6 +311,12 @@
   if (ST->isWave32())
     return V;
 
+  if (ST->hasPermLane64()) {
+    // Reduce across the upper and lower 32 lanes.
+    return buildNonAtomicBinOp(
+        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
+  }
+
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -919,6 +919,12 @@
     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
   }
+  case Intrinsic::amdgcn_permlane64:
+    // A constant value is trivially uniform.
+    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
+      return IC.replaceInstUsesWith(II, C);
+    }
+    break;
   case Intrinsic::amdgcn_readfirstlane:
   case Intrinsic::amdgcn_readlane: {
     // A constant value is trivially uniform.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4267,6 +4267,7 @@
   case Intrinsic::amdgcn_wqm:
   case Intrinsic::amdgcn_softwqm:
   case Intrinsic::amdgcn_set_inactive:
+  case Intrinsic::amdgcn_permlane64:
     return getDefaultMappingAllVGPR(MI);
   case Intrinsic::amdgcn_kernarg_segment_ptr:
   case Intrinsic::amdgcn_s_getpc:
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -518,7 +518,8 @@
 let SubtargetPredicate = isGFX11Plus in {
   // Restrict src0 to be VGPR
   def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
-                                     [],
+                                     getVOP1Pat64<int_amdgcn_permlane64,
+                                                  VOP_MOVRELS>.ret,
                                      /*VOP1Only=*/ 1>;
   defm V_NOT_B16 : VOP1Inst<"v_not_b16", VOP_I16_I16>;
   defm V_CVT_I32_I16 : VOP1Inst<"v_cvt_i32_i16", VOP_I32_I16>;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -969,23 +969,22 @@
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX1164-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-NEXT:    v_readlane_b32 s2, v1, 0
-; GFX1164-NEXT:    v_readlane_b32 s3, v1, 32
+; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    s_add_i32 s0, s2, s3
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
 ; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_add_u32 v0, v3
+; GFX1164-NEXT:    ds_add_u32 v3, v0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:  .LBB3_2:
@@ -2578,23 +2577,22 @@
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
+; GFX1164-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-NEXT:    v_readlane_b32 s2, v1, 0
-; GFX1164-NEXT:    v_readlane_b32 s3, v1, 32
+; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    s_add_i32 s0, s2, s3
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-NEXT:    v_mov_b32_e32 v0, v1
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v3
 ; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_sub_u32 v0, v3
+; GFX1164-NEXT:    ds_sub_u32 v3, v0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:  .LBB10_2:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+
+declare i32 @llvm.amdgcn.permlane64(i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @test_s(i32 addrspace(1)* %out, i32 %src0) {
+; GFX11-LABEL: test_s:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    v_permlane64_b32 v0, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.permlane64(i32 %src0)
+  store i32 %v, i32 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @test_i(i32 addrspace(1)* %out) {
+; GFX11-LABEL: test_i:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x63
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_permlane64_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_endpgm
+  %v = call i32 @llvm.amdgcn.permlane64(i32 99)
+  store i32 %v, i32 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @test_v(i32 addrspace(1)* %out, i32 %src0) #1 {
+; GFX11-SDAG-LABEL: test_v:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-SDAG-NEXT:    v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: test_v:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT:    s_endpgm
+  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+  %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx)
+  store i32 %v, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/permlane64.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -instcombine -S | FileCheck %s
+
+; Optimize the intrinsic away if the argument is uniform.
+define i32 @test_constant() {
+; CHECK-LABEL: @test_constant(
+; CHECK-NEXT:    ret i32 99
+;
+  %call = call i32 @llvm.amdgcn.permlane64(i32 99)
+  ret i32 %call
+}
+
+declare i32 @llvm.amdgcn.permlane64(i32)
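
Note (not part of the patch): as a minimal sketch, the reduction step that the AMDGPUAtomicOptimizer change emits is equivalent to the following IR. The function name @wave64_reduce_step is illustrative only, and it assumes each 32-lane half of the wave already holds its partial sum, as the pass arranges before this step.

; Illustrative sketch of the final wave64 reduction step generated above.
declare i32 @llvm.amdgcn.permlane64(i32)

define i32 @wave64_reduce_step(i32 %partial) {
  ; permlane64 swaps the lower and upper 32 lanes, so each lane sees the
  ; partial sum computed by the opposite half-wave.
  %swapped = call i32 @llvm.amdgcn.permlane64(i32 %partial)
  ; Combining the two halves gives every lane the full 64-lane sum,
  ; replacing the old v_readlane_b32/s_add_i32 sequence.
  %sum = add i32 %partial, %swapped
  ret i32 %sum
}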