diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4594,7 +4594,7 @@ return false; } if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && - ST.getGeneration() < AMDGPUSubtarget::GFX10) { + !ST.hasGFX10Insts()) { if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && DC <= DppCtrl::ROW_NEWBCAST_LAST && !ST.hasGFX90AInsts()) { @@ -7986,6 +7986,16 @@ if (MCOp == -1) return Opcode; + // DPP can be enabled by attribute alone on subtargets older than GFX8 (see + // the Hawaii +dpp tests). Only in that case fall back to the VI encoding; + // GFX8 and newer keep the opcode of their own encoding family. + if (ST.hasDPP() && ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && + (get(Opcode).TSFlags & SIInstrFlags::DPP)) { + uint16_t NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::VI); + if (NMCOp != (uint16_t)-1) + MCOp = NMCOp; + } + if (ST.hasGFX90AInsts()) { uint16_t NMCOp = (uint16_t)-1; if (ST.hasGFX940Insts()) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1189,7 +1189,7 @@ } // End VOP1 = 1, SubtargetPredicate = isGFX8GFX9, Uses = [M0] -let OtherPredicates = [isGFX8Plus] in { +let OtherPredicates = [HasDPP] in { def : GCNPat < (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, @@ -1208,7 +1208,7 @@ (as_i1timm $bound_ctrl)) >; -} // End OtherPredicates = [isGFX8Plus] +} // End OtherPredicates = [HasDPP] let OtherPredicates = [isGFX8Plus] in { def : GCNPat< diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -1,11 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -mattr=+dpp -verify-machineinstrs < %s | FileCheck 
-check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX11 %s +; Note: the Hawaii +dpp run line is to check that we can select this instruction +; only using the attribute, without needing an actual >=GFX8 GPU. + ; FIXME: Merge with DAG test define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) { +; PREGFX8-LABEL: dpp_test: +; PREGFX8: ; %bb.0: +; PREGFX8-NEXT: s_load_dword s3, s[0:1], 0xb +; PREGFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; PREGFX8-NEXT: s_mov_b32 s2, -1 +; PREGFX8-NEXT: s_waitcnt lgkmcnt(0) +; PREGFX8-NEXT: v_mov_b32_e32 v0, s3 +; PREGFX8-NEXT: s_nop 1 +; PREGFX8-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 +; PREGFX8-NEXT: s_mov_b32 s3, 0xf000 +; PREGFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; PREGFX8-NEXT: s_endpgm +; ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c @@ -47,6 +64,19 @@ ret void } define amdgpu_kernel void @mov_dpp64_test(i64 addrspace(1)* %out, i64 %in1) { +; PREGFX8-LABEL: mov_dpp64_test: +; PREGFX8: ; %bb.0: +; PREGFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; PREGFX8-NEXT: s_waitcnt lgkmcnt(0) +; PREGFX8-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX8-NEXT: v_mov_b32_e32 v1, s3 +; PREGFX8-NEXT: v_mov_b32_e32 v3, s1 +; PREGFX8-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; PREGFX8-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; PREGFX8-NEXT: v_mov_b32_e32 v2, s0 +; PREGFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; PREGFX8-NEXT: s_endpgm +; ; GFX8-LABEL: mov_dpp64_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -1,9 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -mattr=+dpp -verify-machineinstrs < %s | FileCheck -check-prefix=PREGFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s +; Note: the Hawaii +dpp run line is to check that we can select this instruction +; only using the attribute, without needing an actual >=GFX8 GPU. 
+ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { +; PREGFX8-LABEL: dpp_test: +; PREGFX8: ; %bb.0: +; PREGFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; PREGFX8-NEXT: s_waitcnt lgkmcnt(0) +; PREGFX8-NEXT: v_mov_b32_e32 v0, s2 +; PREGFX8-NEXT: v_mov_b32_e32 v1, s3 +; PREGFX8-NEXT: s_mov_b32 s2, -1 +; PREGFX8-NEXT: s_mov_b32 s3, 0xf000 +; PREGFX8-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; PREGFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; PREGFX8-NEXT: s_endpgm +; ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -43,6 +59,24 @@ ret void } define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) { +; PREGFX8-LABEL: update_dpp64_test: +; PREGFX8: ; %bb.0: +; PREGFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; PREGFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; PREGFX8-NEXT: s_waitcnt lgkmcnt(0) +; PREGFX8-NEXT: v_mov_b32_e32 v0, s0 +; PREGFX8-NEXT: v_mov_b32_e32 v1, s1 +; PREGFX8-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; PREGFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; PREGFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; PREGFX8-NEXT: v_mov_b32_e32 v5, s3 +; PREGFX8-NEXT: v_mov_b32_e32 v4, s2 +; PREGFX8-NEXT: s_waitcnt vmcnt(0) +; PREGFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; PREGFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; PREGFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; PREGFX8-NEXT: s_endpgm +; ; GFX8-LABEL: update_dpp64_test: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll @@ -1,8 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -mattr=+dpp,-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck 
-check-prefixes=VI,VI-OPT,PREGFX10,PREGFX10-OPT %s +; RUN: llc -O0 -march=amdgcn -mcpu=hawaii -mattr=+dpp,-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-NOOPT,PREGFX10,PREGFX10-NOOPT %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT,PREGFX10,PREGFX10-OPT %s ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-NOOPT,PREGFX10,PREGFX10-NOOPT %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s +; Note: the Hawaii +dpp run lines check that we can select this instruction +; only using the attribute, without needing an actual >=GFX8 GPU. + ; FIXME: The register allocator / scheduler should be able to avoid these hazards. 
; VI-LABEL: {{^}}dpp_test: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -1,8 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -mattr=+dpp,-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s +; RUN: llc -march=amdgcn -mcpu=hawaii -O0 -mattr=+dpp,-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s ; RUN: llc -march=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s +; Note: the Hawaii +dpp run lines check that we can select this instruction +; only using the attribute, without needing an actual >=GFX8 GPU. 
+ ; GCN-LABEL: {{^}}dpp_test: ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} @@ -32,8 +37,8 @@ ; GCN-LABEL: {{^}}dpp_test1: ; GFX10,GFX11: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}} -; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX8-OPT: v_add_{{u|i}}32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GFX8-NOOPT: v_add_{{u|i}}32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0 ; GFX8: s_nop 1 ; GFX8-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf