AMDGPU: add the llvm.amdgcn.ds.add.gs.reg.rtn and
llvm.amdgcn.ds.sub.gs.reg.rtn intrinsics.

* IntrinsicsAMDGPU.td: declares both intrinsics. The result type is
  overloaded (any integer); the operands are an i32 source value and an
  i32 offset that must be an immediate (argument index 1).
* AMDGPURegisterBankInfo.cpp: maps the result (operand 0) and the source
  value (operand 2) to the VGPR bank.
* DSInstructions.td: selects DS_ADD_GS_REG_RTN / DS_SUB_GS_REG_RTN. The
  i64 form uses the instruction result directly; the i32 form copies the
  result to VReg_64 and extracts sub0.
* Tests: run both SelectionDAG and GlobalISel for gfx1100, covering the
  i32 and i64 result types with used and unused results.

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1925,6 +1925,16 @@
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn]>;
 
+def int_amdgcn_ds_add_gs_reg_rtn :
+  GCCBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
+  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
+            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;
+
+def int_amdgcn_ds_sub_gs_reg_rtn :
+  GCCBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
+  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
+            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;
+
 //===----------------------------------------------------------------------===//
 // Deep learning intrinsics.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4673,6 +4673,11 @@
       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
       break;
     }
+    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
+    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
+      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      break;
     default:
       return getInvalidInstructionMapping();
     }
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1112,6 +1112,34 @@
   (DS_ORDERED_COUNT $value, (as_i16imm $offset))
 >;
 
+def : GCNPat <
+  (i64 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)),
+  (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32))
+>;
+
+def : GCNPat <
+  (i32 (int_amdgcn_ds_add_gs_reg_rtn i32:$src, timm:$offset32)),
+  (EXTRACT_SUBREG
+    (i64 (COPY_TO_REGCLASS
+      (DS_ADD_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)),
+      VReg_64)),
+    sub0)
+>;
+
+def : GCNPat <
+  (i64 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)),
+  (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32))
+>;
+
+def : GCNPat <
+  (i32 (int_amdgcn_ds_sub_gs_reg_rtn i32:$src, timm:$offset32)),
+  (EXTRACT_SUBREG
+    (i64 (COPY_TO_REGCLASS
+      (DS_SUB_GS_REG_RTN VGPR_32:$src, (as_i32timm $offset32)),
+      VReg_64)),
+    sub0)
+>;
+
 //===----------------------------------------------------------------------===//
 // Target-specific instruction encodings.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32, i32 immarg)
+declare i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32, i32 immarg)
+
+define amdgpu_gs void @test_add_32(i32 %arg) {
+; CHECK-LABEL: test_add_32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    ds_add_gs_reg_rtn v[0:1], v0 offset:16 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    buffer_gl0_inv
+; CHECK-NEXT:    buffer_gl1_inv
+; CHECK-NEXT:    s_endpgm
+  %unused = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
+  ret void
+}
+
+define amdgpu_gs void @test_add_32_use(i32 %arg, i32 addrspace(1)* %out) {
+; CHECK-LABEL: test_add_32_use:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    ds_add_gs_reg_rtn v[3:4], v0 offset:16 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    buffer_gl0_inv
+; CHECK-NEXT:    buffer_gl1_inv
+; CHECK-NEXT:    global_store_b32 v[1:2], v3, off
+; CHECK-NEXT:    s_endpgm
+  %res = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
+  store i32 %res, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_add_64(i32 %arg) {
+; CHECK-LABEL: test_add_64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    ds_add_gs_reg_rtn v[0:1], v0 offset:32 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    buffer_gl0_inv
+; CHECK-NEXT:    buffer_gl1_inv
+; CHECK-NEXT:    s_endpgm
+  %unused = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
+  ret void
+}
+
+define amdgpu_gs void @test_add_64_use(i32 %arg, i64 addrspace(1)* %out) {
+; CHECK-LABEL: test_add_64_use:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    ds_add_gs_reg_rtn v[3:4], v0 offset:32 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    buffer_gl0_inv
+; CHECK-NEXT:    buffer_gl1_inv
+; CHECK-NEXT:    global_store_b64 v[1:2], v[3:4], off
+; CHECK-NEXT:    s_endpgm
+  %res = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
+  store i64 %res, i64 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32, i32 immarg)
+declare i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32, i32 immarg)
+
+define amdgpu_gs void @test_sub_32(i32 %arg) {
+; CHECK-LABEL: test_sub_32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    ds_sub_gs_reg_rtn v[0:1], v0 offset:16 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    buffer_gl0_inv
+; CHECK-NEXT:    buffer_gl1_inv
+; CHECK-NEXT:    s_endpgm
+  %unused = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
+  ret void
+}
+
+define amdgpu_gs void @test_sub_32_use(i32 %arg, i32 addrspace(1)* %out) {
+; CHECK-LABEL: test_sub_32_use:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    ds_sub_gs_reg_rtn v[3:4], v0 offset:16 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    buffer_gl0_inv
+; CHECK-NEXT:    buffer_gl1_inv
+; CHECK-NEXT:    global_store_b32 v[1:2], v3, off
+; CHECK-NEXT:    s_endpgm
+  %res = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
+  store i32 %res, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_gs void @test_sub_64(i32 %arg) {
+; CHECK-LABEL: test_sub_64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    ds_sub_gs_reg_rtn v[0:1], v0 offset:32 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    buffer_gl0_inv
+; CHECK-NEXT:    buffer_gl1_inv
+; CHECK-NEXT:    s_endpgm
+  %unused = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
+  ret void
+}
+
+define amdgpu_gs void @test_sub_64_use(i32 %arg, i64 addrspace(1)* %out) {
+; CHECK-LABEL: test_sub_64_use:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    ds_sub_gs_reg_rtn v[3:4], v0 offset:32 gds
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    buffer_gl0_inv
+; CHECK-NEXT:    buffer_gl1_inv
+; CHECK-NEXT:    global_store_b64 v[1:2], v[3:4], off
+; CHECK-NEXT:    s_endpgm
+  %res = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
+  store i64 %res, i64 addrspace(1)* %out, align 4
+  ret void
+}
+