Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -206,12 +206,15 @@ defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>; } -def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">, +def gi_as_i32timm : GICustomOperandRenderer<"renderTruncTImm32">, GISDNodeXFormEquiv<as_i32timm>; -def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm">, +def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm16">, GISDNodeXFormEquiv<as_i16timm>; +def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm1">, + GISDNodeXFormEquiv<as_i1timm>; + def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">, GISDNodeXFormEquiv<NegateImm>; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -174,8 +174,12 @@ void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx = -1) const; - void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI, - int OpIdx) const; + void renderTruncTImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderTruncTImm16(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderTruncTImm1(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2376,9 +2376,21 @@ /// This only really exists to satisfy DAG type checking machinery, so is a /// no-op here. 
-void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, - const MachineInstr &MI, - int OpIdx) const { +void AMDGPUInstructionSelector::renderTruncTImm32(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + MIB.addImm(MI.getOperand(OpIdx).getImm()); +} + +void AMDGPUInstructionSelector::renderTruncTImm16(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + MIB.addImm(MI.getOperand(OpIdx).getImm()); +} + +void AMDGPUInstructionSelector::renderTruncTImm1(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { MIB.addImm(MI.getOperand(OpIdx).getImm()); } Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -669,6 +669,10 @@ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); }]>; +def as_i1timm : SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); +}]>; + def as_i8imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8); }]>; Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1907,9 +1907,9 @@ def : GCNPat < (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask, timm:$bound_ctrl)), - (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl), - (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl)) + (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl), + (as_i32timm $row_mask), (as_i32timm $bank_mask), + (as_i1timm $bound_ctrl)) >; //===----------------------------------------------------------------------===// Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- 
llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -843,11 +843,12 @@ >; def : GCNPat < - (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, timm:$row_mask, - timm:$bank_mask, timm:$bound_ctrl)), - (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl), - (as_i32imm $row_mask), (as_i32imm $bank_mask), - (as_i1imm $bound_ctrl)) + (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, + timm:$row_mask, timm:$bank_mask, + timm:$bound_ctrl)), + (V_MOV_B32_dpp VGPR_32:$old, VGPR_32:$src, (as_i32timm $dpp_ctrl), + (as_i32timm $row_mask), (as_i32timm $bank_mask), + (as_i1timm $bound_ctrl)) >; } // End OtherPredicates = [isGFX8Plus] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +; FIXME: Merge with DAG test + +define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { +; GFX8-LABEL: dpp_test: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX10-LABEL: dpp_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; 
GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) + store i32 %tmp0, i32 addrspace(1)* %out + ret void +} +define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) { +; GFX8-LABEL: update_dpp64_test: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, 8, v1 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_mul_hi_u32 v3, 8, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, 8, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GFX8-NEXT: s_endpgm +; +; GFX10-LABEL: update_dpp64_test: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mul_hi_u32 v3, 8, v0 +; GFX10-NEXT: v_mul_lo_u32 v0, 8, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, 8, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: 
v_add_nc_u32_e32 v1, v2, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[2:3], v[6:7], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v[6:7], v[4:5], off +; GFX10-NEXT: s_endpgm + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id + %load = load i64, i64 addrspace(1)* %gep + %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 false) #1 + store i64 %tmp0, i64 addrspace(1)* %gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1 +declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #1 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { convergent nounwind readnone }