diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -894,6 +894,8 @@ bool hasMovB64() const { return GFX940Insts; } + bool hasLshlAddB64() const { return GFX940Insts; } + bool enableSIScheduler() const { return EnableSIScheduler; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4071,6 +4071,21 @@ bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); + MachineOperand &Dest = MI.getOperand(0); + MachineOperand &Src0 = MI.getOperand(1); + MachineOperand &Src1 = MI.getOperand(2); + + if (IsAdd && ST.hasLshlAddB64()) { + auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), + Dest.getReg()) + .add(Src0) + .addImm(0) + .add(Src1); + TII->legalizeOperands(*Add); + MI.eraseFromParent(); + return BB; + } + const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -4079,10 +4094,6 @@ Register CarryReg = MRI.createVirtualRegister(CarryRC); Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); - MachineOperand &Dest = MI.getOperand(0); - MachineOperand &Src0 = MI.getOperand(1); - MachineOperand &Src1 = MI.getOperand(2); - const TargetRegisterClass *Src0RC = Src0.isReg() ? 
MRI.getRegClass(Src0.getReg()) : &AMDGPU::VReg_64RegClass; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -609,6 +609,23 @@ }]; } +def shl_0_to_4 : PatFrag< + (ops node:$src0, node:$src1), (shl node:$src0, node:$src1), + [{ + if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + return C->getZExtValue() <= 4; + } + return false; + }]> { + let GISelPredicateCode = [{ + int64_t Imm = 0; + if (!mi_match(MI.getOperand(2).getReg(), MRI, m_ICst(Imm)) && + !mi_match(MI.getOperand(2).getReg(), MRI, m_Copy(m_ICst(Imm)))) + return false; + return (uint64_t)Imm <= 4; + }]; +} + let SubtargetPredicate = isGFX9Plus in { let isCommutable = 1, isReMaterializable = 1 in { defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; @@ -649,6 +666,10 @@ defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; } // End isReMaterializable = 1 +// V_LSHL_ADD_U64: D0.u64 = (S0.u64 << S1.u[2:0]) + S2.u64 +// src0 is shifted left by 0-4 (use "0" to get ADD_U64). +let SubtargetPredicate = isGFX940Plus in +defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>; class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat < // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions. 
@@ -664,6 +685,12 @@ def : ThreeOp_i32_Pats<add, add, V_ADD3_U32_e64>; def : ThreeOp_i32_Pats<shl, or, V_LSHL_OR_B32_e64>; +let SubtargetPredicate = isGFX940Plus in +def : GCNPat< + (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2), + (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) +>; + def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>; def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>; @@ -1273,3 +1300,5 @@ defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>; defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>; + +defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>; diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll @@ -0,0 +1,108 @@ +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) { +; GCN-LABEL: lshl_add_u64_v1v: +; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1, v[{{[0-9:]+}}] + %shl = shl i64 %v, 1 + %add = add i64 %shl, %a + ret i64 %add +} + +define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) { +; GCN-LABEL: lshl_add_u64_v4v: +; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4, v[{{[0-9:]+}}] + %shl = shl i64 %v, 4 + %add = add i64 %shl, %a + ret i64 %add +} + +define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) { +; GCN-LABEL: lshl_add_u64_v5v: +; GCN: v_lshlrev_b64 +; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}] + %shl = shl i64 %v, 5 + %add = add i64 %shl, %a + ret i64 %add +} + +define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) { +; GCN-LABEL: lshl_add_u64_vvv: +; GCN: v_lshlrev_b64 +; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}] + %shl = shl i64 %v, %s + %add = add i64 %shl, %a + ret i64 %add +} + +define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) { +; GCN-LABEL: lshl_add_u64_s2v: +; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 2, v[{{[0-9:]+}}] + %a = load i64, i64* undef + %shl = shl i64 %v, 2 + %add = add i64 %shl, %a + store i64 %add, i64* undef + 
ret void +} + +define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) { +; GCN-LABEL: lshl_add_u64_v2s: +; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 2, s[{{[0-9:]+}}] + %v = load i64, i64* undef + %shl = shl i64 %v, 2 + %add = add i64 %shl, %a + store i64 %add, i64* undef + ret void +} + +define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) { +; GCN-LABEL: lshl_add_u64_s2s: +; GCN: s_lshl_b64 +; GCN: s_add_u32 +; GCN: s_addc_u32 + %shl = shl i64 %v, 2 + %add = add i64 %shl, %a + store i64 %add, i64* undef + ret void +} + +define i64 @add_u64_vv(i64 %v, i64 %a) { +; GCN-LABEL: add_u64_vv: +; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] + %add = add i64 %v, %a + ret i64 %add +} + +define amdgpu_kernel void @add_u64_sv(i64 %v) { +; GCN-LABEL: add_u64_sv: +; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] + %a = load i64, i64* undef + %add = add i64 %v, %a + store i64 %add, i64* undef + ret void +} + +define amdgpu_kernel void @add_u64_vs(i64 %a) { +; GCN-LABEL: add_u64_vs: +; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] + %v = load i64, i64* undef + %add = add i64 %v, %a + store i64 %add, i64* undef + ret void +} + +define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) { +; GCN-LABEL: add_u64_ss: +; GCN: s_add_u32 +; GCN: s_addc_u32 s1, s1, s3 + %add = add i64 %v, %a + store i64 %add, i64* undef + ret void +} + +define i32 @lshl_add_u64_gep(i32 *%p, i64 %a) { +; GCN-LABEL: lshl_add_u64_gep: +; GCN: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] + %gep = getelementptr inbounds i32, i32* %p, i64 %a + %v = load i32, i32* %gep + ret i32 %v +} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -212,10 +212,9 @@ ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX940-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 @@ -227,10 +226,9 @@ ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX940-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX940-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 @@ -450,13 +448,12 @@ ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s1 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX940-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt ; GFX940-NOTTGSPLIT-NEXT: s_endpgm @@ -465,13 +462,12 @@ ; GFX940-TGSPLIT: ; %bb.0: ; 
%entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s1 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX940-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX940-TGSPLIT-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt ; GFX940-TGSPLIT-NEXT: s_endpgm diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s --- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s @@ -149,6 +149,22 @@ // GFX940: v_mov_b64_e32 v[2:3], 0x64 ; encoding: [0xff,0x70,0x04,0x7e,0x64,0x00,0x00,0x00] v_mov_b64 v[2:3], 0x64 +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: v_lshl_add_u64 v[2:3], s[4:5], v7, v[8:9] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x0e,0x22,0x04] +v_lshl_add_u64 v[2:3], s[4:5], v7, v[8:9] + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: v_lshl_add_u64 v[2:3], v[4:5], 0, 1 ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x01,0x05,0x02] +v_lshl_add_u64 v[2:3], v[4:5], 0, 1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: v_lshl_add_u64 v[2:3], v[4:5], 3, s[2:3] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x07,0x09,0x00] +v_lshl_add_u64 v[2:3], v[4:5], 3, s[2:3] + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x08,0x09,0x04] +v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3] + // GFX90A: error: invalid operand for instruction // GFX10: error: instruction not supported on this GPU // GFX940: buffer_wbl2 
sc1 ; encoding: [0x00,0x80,0xa0,0xe0,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt --- a/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt @@ -102,6 +102,18 @@ # GFX940: v_mov_b64_e32 v[2:3], 0x64 ; encoding: [0xff,0x70,0x04,0x7e,0x64,0x00,0x00,0x00] 0xff,0x70,0x04,0x7e,0x64,0x00,0x00,0x00 +# GFX940: v_lshl_add_u64 v[2:3], s[4:5], v7, v[8:9] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x0e,0x22,0x04] +0x02,0x00,0x08,0xd2,0x04,0x0e,0x22,0x04 + +# GFX940: v_lshl_add_u64 v[2:3], v[4:5], 0, 1 ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x01,0x05,0x02] +0x02,0x00,0x08,0xd2,0x04,0x01,0x05,0x02 + +# GFX940: v_lshl_add_u64 v[2:3], v[4:5], 3, s[2:3] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x07,0x09,0x00] +0x02,0x00,0x08,0xd2,0x04,0x07,0x09,0x00 + +# GFX940: v_lshl_add_u64 v[2:3], s[4:5], 4, v[2:3] ; encoding: [0x02,0x00,0x08,0xd2,0x04,0x08,0x09,0x04] +0x02,0x00,0x08,0xd2,0x04,0x08,0x09,0x04 + # GFX940: buffer_wbl2 sc1 ; encoding: [0x00,0x80,0xa0,0xe0,0x00,0x00,0x00,0x00] 0x00,0x80,0xa0,0xe0,0x00,0x00,0x00,0x00