diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1328,7 +1328,21 @@ IntrWillReturn] >; -// exp with compr bit set. +// exp with row_en bit set. Only supported on GFX11+. +def int_amdgcn_exp_row : Intrinsic <[], [ + llvm_i32_ty, // tgt, + llvm_i32_ty, // en + llvm_any_ty, // src0 (f32 or i32) + LLVMMatchType<0>, // src1 + LLVMMatchType<0>, // src2 + LLVMMatchType<0>, // src3 + llvm_i1_ty, // done + llvm_i32_ty], // row number + [ImmArg>, ImmArg>, ImmArg>, + IntrWriteMem, IntrInaccessibleMemOnly, IntrWillReturn] +>; + +// exp with compr bit set. Not supported on GFX11+. def int_amdgcn_exp_compr : Intrinsic <[], [ llvm_i32_ty, // tgt, llvm_i32_ty, // en diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -621,6 +621,7 @@ return IC.replaceInstUsesWith(II, RightShift); } case Intrinsic::amdgcn_exp: + case Intrinsic::amdgcn_exp_row: case Intrinsic::amdgcn_exp_compr: { ConstantInt *En = cast(II.getArgOperand(1)); unsigned EnBits = En->getZExtValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4535,6 +4535,7 @@ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); break; case Intrinsic::amdgcn_exp: + case Intrinsic::amdgcn_exp_row: // FIXME: Could we support packed types here? OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); diff --git a/llvm/lib/Target/AMDGPU/EXPInstructions.td b/llvm/lib/Target/AMDGPU/EXPInstructions.td --- a/llvm/lib/Target/AMDGPU/EXPInstructions.td +++ b/llvm/lib/Target/AMDGPU/EXPInstructions.td @@ -134,6 +134,15 @@ ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en) >; +class ExpRowPattern : GCNPat< + (int_amdgcn_exp_row timm:$tgt, timm:$en, + (vt ExpSrc0:$src0), (vt ExpSrc1:$src1), + (vt ExpSrc2:$src2), (vt ExpSrc3:$src3), + done_val, M0), + (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1, + ExpSrc2:$src2, ExpSrc3:$src3, 0, 0, timm:$en) +>; + class ExpComprPattern : GCNPat< (int_amdgcn_exp_compr timm:$tgt, timm:$en, (vt ExpSrc0:$src0), (vt ExpSrc1:$src1), @@ -150,6 +159,11 @@ def : ExpPattern; def : ExpPattern; +def : ExpRowPattern; +def : ExpRowPattern; +def : ExpRowPattern; +def : ExpRowPattern; + def : ExpComprPattern; def : ExpComprPattern; def : ExpComprPattern; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s + +declare void @llvm.amdgcn.exp.row.i32(i32, i32, i32, i32, i32, i32, i1, i32) +declare void @llvm.amdgcn.exp.row.f32(i32, i32, float, float, float, float, i1, i32) +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @undef_i32() #0 { +; CHECK-LABEL: undef_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 m0, 0 +; CHECK-NEXT: exp pos0 off, off, off, off row_en +; CHECK-NEXT: exp pos1 off, off, off, off done row_en +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.exp.row.i32(i32 12, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i1 false, i32 0) + call void @llvm.amdgcn.exp.row.i32(i32 13, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i1 true, i32 0) + ret void +} + +define amdgpu_kernel void @undef_f32() #0 { +; CHECK-LABEL: undef_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 m0, 0 +; CHECK-NEXT: exp pos0 off, off, off, off row_en +; CHECK-NEXT: exp pos1 off, off, off, off done row_en +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.exp.row.f32(i32 12, i32 0, float undef, float undef, float undef, float undef, i1 false, i32 0) + call void @llvm.amdgcn.exp.row.f32(i32 13, i32 0, float undef, float undef, float undef, float undef, i1 true, i32 0) + ret void +} + +; FIXME: no need for readfirstlane here +define amdgpu_kernel void @zero_i32() #0 { +; CHECK-LABEL: zero_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b32 m0, 0 +; CHECK-NEXT: exp pos0 v0, v0, v0, off row_en +; CHECK-NEXT: exp pos1 v0, v0, v0, off done row_en +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.exp.row.i32(i32 12, i32 7, i32 0, i32 0, i32 0, i32 undef, i1 false, i32 0) + call void @llvm.amdgcn.exp.row.i32(i32 13, i32 7, i32 0, i32 0, i32 0, i32 undef, i1 true, i32 0) + ret void +} + +define amdgpu_kernel void @one_f32() #0 { +; CHECK-LABEL: one_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_mov_b32_e32 v0, 1.0 +; CHECK-NEXT: s_mov_b32 m0, 0 +; CHECK-NEXT: exp pos0 v0, v0, v0, off row_en +; CHECK-NEXT: exp pos1 v0, v0, v0, off done row_en +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.exp.row.f32(i32 12, i32 7, float 1.0, float 1.0, float 1.0, float undef, i1 false, i32 0) + call void @llvm.amdgcn.exp.row.f32(i32 13, i32 7, float 1.0, float 1.0, float 1.0, float undef, i1 true, i32 0) + ret void +} + +define amdgpu_kernel void @id_i32() #0 { +; CHECK-LABEL: id_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 m0, 0 +; CHECK-NEXT: exp pos0 v0, off, off, off done row_en +; CHECK-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 %id, i32 undef, i32 undef, i32 undef, i1 true, i32 0) + ret void +} + +define amdgpu_kernel void @id_arg_i32(i32 %row) #0 { +; CHECK-LABEL: id_arg_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_b32 s0, s[0:1], 0x24 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 m0, s0 +; CHECK-NEXT: exp pos0 v0, off, off, off done row_en +; CHECK-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 %id, i32 undef, i32 undef, i32 undef, i1 true, i32 %row) + ret void +} + +; Divergent row number just causes a readfirstlane for now. +define amdgpu_kernel void @id_row_i32() #0 { +; CHECK-LABEL: id_row_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x63 +; CHECK-NEXT: s_mov_b32 m0, s0 +; CHECK-NEXT: exp pos0 v0, off, off, off done row_en +; CHECK-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + call void @llvm.amdgcn.exp.row.i32(i32 12, i32 1, i32 99, i32 undef, i32 undef, i32 undef, i1 true, i32 %id) + ret void +}