diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -98,7 +98,7 @@ } private: - int getDPPOp(unsigned Op, bool IsShrinkable) const; + int getDPPOp(unsigned Op, bool HasVOP3DPP, bool IsShrinkable) const; bool isShrinkable(MachineInstr &MI) const; }; @@ -142,14 +142,22 @@ return true; } -int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const { +int GCNDPPCombine::getDPPOp(unsigned Op, bool HasVOP3DPP, + bool IsShrinkable) const { auto DPP32 = AMDGPU::getDPPOp32(Op); + auto DPP64 = AMDGPU::getDPPOp64(Op); if (IsShrinkable) { assert(DPP32 == -1); auto E32 = AMDGPU::getVOPe32(Op); DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32); } - return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32; + if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1) { + return DPP32; + } + if (HasVOP3DPP && DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1) { + return DPP64; + } + return -1; } // tracks the register operand definition and returns: @@ -188,8 +196,9 @@ MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp || MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); + bool HasVOP3DPP = ST->hasVOP3DPP(); auto OrigOp = OrigMI.getOpcode(); - auto DPPOp = getDPPOp(OrigOp, IsShrinkable); + auto DPPOp = getDPPOp(OrigOp, HasVOP3DPP, IsShrinkable); if (DPPOp == -1) { LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n"); return nullptr; @@ -201,10 +210,18 @@ bool Fail = false; do { - auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst); - assert(Dst); - DPPInst.add(*Dst); - int NumOperands = 1; + int NumOperands = 0; + if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) { + DPPInst.add(*Dst); + ++NumOperands; + } + if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) { + if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) { + DPPInst.add(*SDst); + ++NumOperands; + } + // If we shrunk a 64bit vop3b to 32bits, just ignore the sdst + } const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); if (OldIdx != -1) { @@ -230,7 +247,8 @@ AMDGPU::OpName::src0_modifiers)) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers)); - assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + assert(HasVOP3DPP || + (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); DPPInst.addImm(Mod0->getImm()); ++NumOperands; } else if (AMDGPU::getNamedOperandIdx(DPPOp, @@ -253,7 +271,8 @@ AMDGPU::OpName::src1_modifiers)) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers)); - assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + assert(HasVOP3DPP || + (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); DPPInst.addImm(Mod1->getImm()); ++NumOperands; } else if (AMDGPU::getNamedOperandIdx(DPPOp, @@ -261,7 +280,8 @@ DPPInst.addImm(0); ++NumOperands; } - if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); + if (Src1) { if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); Fail = true; @@ -270,8 +290,17 @@ DPPInst.add(*Src1); ++NumOperands; } - - if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) { + if (auto *Mod2 = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) { + assert(NumOperands == + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers)); + assert(HasVOP3DPP || + (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)))); + DPPInst.addImm(Mod2->getImm()); + ++NumOperands; + } + auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); + if (Src2) { if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) || !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) { LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n"); @@ -279,8 +308,62 @@ break; } DPPInst.add(*Src2); + ++NumOperands; + } + if (HasVOP3DPP) { + auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp); + if (ClampOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::clamp) != -1) { + DPPInst.addImm(ClampOpr->getImm()); + } + auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in); + if (VdstInOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::vdst_in) != -1) { + DPPInst.add(*VdstInOpr); + } + auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod); + if (OmodOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::omod) != -1) { + DPPInst.addImm(OmodOpr->getImm()); + } + // Validate OP_SEL has to be set to all 0 and OP_SEL_HI has to be set to + // all 1. + if (auto *OpSelOpr = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) { + auto OpSel = OpSelOpr->getImm(); + if (OpSel != 0) { + LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n"); + Fail = true; + break; + } + if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel) != -1) + DPPInst.addImm(OpSel); + } + if (auto *OpSelHiOpr = + TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) { + auto OpSelHi = OpSelHiOpr->getImm(); + // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check + // the bitmask for 3 op_sel_hi bits set + assert(Src2 && "Expected vop3p with 3 operands"); + if (OpSelHi != 7) { + LLVM_DEBUG(dbgs() << " failed: op_sel_hi must be all set to one\n"); + Fail = true; + break; + } + if (AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::op_sel_hi) != -1) + DPPInst.addImm(OpSelHi); + } + auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo); + if (NegOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_lo) != -1) { + DPPInst.addImm(NegOpr->getImm()); + } + auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi); + if (NegHiOpr && + AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::neg_hi) != -1) { + DPPInst.addImm(NegHiOpr->getImm()); + } } - DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); @@ -531,8 +614,16 @@ } bool IsShrinkable = isShrinkable(OrigMI); - if (!(IsShrinkable || TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) { - LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n"); + if (!(IsShrinkable || + ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) || + TII->isVOP3(OrigOp)) && + ST->hasVOP3DPP()) || + TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) { + LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3/3P/C\n"); + break; + } + if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) { + LLVM_DEBUG(dbgs() << " failed: can't combine v_cmpx\n"); break; } @@ -543,9 +634,12 @@ break; } + auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2); assert(Src0 && "Src1 without Src0?"); - if (Src1 && Src1->isIdenticalTo(*Src0)) { - assert(Src1->isReg()); + if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) || + (Src2 && Src2->isIdenticalTo(*Src0)))) || + (Use == Src1 && (Src1->isIdenticalTo(*Src0) || + (Src2 && Src2->isIdenticalTo(*Src1))))) { LLVM_DEBUG( dbgs() << " " << OrigMI diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1233,6 +1233,9 @@ LLVM_READONLY int getDPPOp32(uint16_t Opcode); + LLVM_READONLY + int getDPPOp64(uint16_t Opcode); + LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2182,21 +2182,21 @@ "$sdst", "$vdst"), ""); // use $sdst for VOPC - string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); - string isrc1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1", - " $src1,")); - string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); - - string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); - string fsrc1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1_modifiers", - " $src1_modifiers,")); - string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); - - string src0 = !if(Src0HasMods, fsrc0, isrc0); - string src1 = !if(Src1HasMods, fsrc1, isrc1); - string src2 = !if(Src2HasMods, fsrc2, isrc2); + string src0nomods = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string src1nomods = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1", + " $src1,")); + string src2nomods = !if(!eq(NumSrcArgs, 3), " $src2", ""); + + string src0mods = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1mods = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string src2mods = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + + string src0 = !if(Src0HasMods, src0mods, src0nomods); + string src1 = !if(Src1HasMods, src1mods, src1nomods); + string src2 = !if(Src2HasMods, src2mods, src2nomods); string opsel = !if(HasOpSel, "$op_sel", ""); string 3PMods = !if(IsVOP3P, !if(HasOpSel, "$op_sel_hi", "") @@ -2546,8 +2546,8 @@ // the asm operand name via this HasModifiers flag field string AsmDPP8 = getAsmDPP8.ret; field string AsmVOP3DPPBase = getAsmVOP3DPPBase.ret; + HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers, + HasModifiers, DstVT>.ret; field string AsmVOP3DPP = getAsmVOP3DPP.ret; field string AsmVOP3DPP16 = getAsmVOP3DPP16.ret; field string AsmVOP3DPP8 = getAsmVOP3DPP8.ret; @@ -2787,6 +2787,14 @@ let ValueCols = [["DPP"]]; } +def getDPPOp64 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["VOP3"]; + let ValueCols = [["VOP3_DPP"]]; +} + // Maps an commuted opcode to its original version def getCommuteOrig : InstrMapping { let FilterClass = "Commutable_REV"; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1210,7 +1210,9 @@ let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); - let HasModifiers = !if(Features.IsMAI, 0, !or(Features.IsPacked, P.HasModifiers)); + let HasModifiers = + !if (Features.IsMAI, 0, + !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers)); } class VOP3_Profile : VOP3_Profile_Base { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX11 %s ; FIXME: Merge with DAG test @@ -29,6 +30,18 @@ ; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x02,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: dpp_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01] +; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0 store i32 %tmp0, i32 addrspace(1)* %out ret void @@ -58,6 +71,18 @@ ; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: mov_dpp64_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; encoding: [0x02,0x00,0x10,0xca,0x03,0x00,0x00,0x00] +; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11] +; GFX11-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x00,0x00] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] %tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 false) #0 store i64 %tmp0, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { ; GFX8-LABEL: dpp_test: @@ -29,6 +30,19 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dpp_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) store i32 %tmp0, i32 addrspace(1)* %out ret void @@ -65,6 +79,20 @@ ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: update_dpp64_test: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id %load = load i64, i64 addrspace(1)* %gep diff --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll --- a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll +++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll @@ -1,9 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64 -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10 +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS ; GCN-LABEL: {{^}}dpp64_ceil: -; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]], +; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]], ; DPP64: v_ceil_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} ; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} define amdgpu_kernel void @dpp64_ceil(i64 addrspace(1)* %arg, i64 %in1) { @@ -19,7 +20,7 @@ } ; GCN-LABEL: {{^}}dpp64_rcp: -; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]], +; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]], ; DPP64: v_rcp_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} ; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} define amdgpu_kernel void @dpp64_rcp(i64 addrspace(1)* %arg, i64 %in1) { @@ -50,12 +51,12 @@ } ; GCN-LABEL: {{^}}dpp64_div: -; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]], -; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} -; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} -; GFX10-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} -; GCN: v_div_scale_f64 -; GCN: v_rcp_f64_e32 +; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]], +; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; GFX10PLUS-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; GCN: v_div_scale_f64 +; GCN: v_rcp_f64_e32 define amdgpu_kernel void @dpp64_div(i64 addrspace(1)* %arg, i64 %in1) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll --- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll @@ -1,8 +1,9 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN ; GCN-LABEL: {{^}}dpp_add: -; GCN: global_load_dword [[V:v[0-9]+]], +; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]], ; GCN: v_add_{{(nc_)?}}u32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_add(i32 addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -15,7 +16,7 @@ } ; GCN-LABEL: {{^}}dpp_ceil: -; GCN: global_load_dword [[V:v[0-9]+]], +; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]], ; GCN: v_ceil_f32_dpp [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_ceil(i32 addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -30,7 +31,7 @@ } ; GCN-LABEL: {{^}}dpp_fadd: -; GCN: global_load_dword [[V:v[0-9]+]], +; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]], ; GCN: v_add_f32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_fadd(i32 addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir @@ -0,0 +1,810 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN + +--- + +# GCN-label: name: vop3 +# GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec +# GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec +# GCN: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec +name: vop3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec + + %5:sreg_32_xm0_xexec = IMPLICIT_DEF + %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64 %4, %1, %5, 1, implicit $exec + + %8:vgpr_32 = V_CVT_PK_U8_F32_e64 4, %4, 2, %2, 2, %1, 1, implicit $mode, implicit $exec + + ; should not be combined because src2 literal is illegal + %9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec + + ; should not be combined because src1 imm is illegal + %11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec +... + +# Regression test for src_modifiers on base u16 opcode +# GCN-label: name: vop3_u16 +# GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec +# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 4, %5, 8, %5, 0, 0, 1, 15, 15, 1, implicit $exec +name: vop3_u16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %5:vgpr_32 = V_ADD_NC_U16_e64 0, %4, 0, %3, 0, 0, implicit $exec + %6:vgpr_32 = V_MOV_B32_dpp %3, %5, 1, 15, 15, 1, implicit $exec + %7:vgpr_32 = V_ADD_NC_U16_e64 4, %6, 8, %5, 0, 0, implicit $exec +... + +name: vop3p +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GCN-LABEL: name: vop3p + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec + ; GCN: [[V_DOT2_F32_F16_:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp]], 0, [[COPY]], 0, [[COPY2]], 0, 5, 0, 0, 0, implicit $mode, implicit $exec + ; GCN: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec + ; GCN: [[V_DOT2_F32_F16_1:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, 4, 0, 0, implicit $mode, implicit $exec + ; GCN: [[V_DOT2_F32_F16_dpp:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16_dpp [[DEF]], 10, [[COPY1]], 8, [[COPY]], 13, [[COPY2]], 1, 0, 7, 4, 5, 1, 15, 15, 1, implicit $mode, implicit $exec + ; GCN: [[V_FMA_MIX_F32_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec + ; GCN: [[V_FMA_MIXLO_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 0, [[COPY2]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec + ; GCN: [[V_FMA_MIXHI_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, [[COPY]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = IMPLICIT_DEF + + ; this should not be combined because op_sel is not zero + %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %5:vgpr_32 = V_DOT2_F32_F16 0, %4, 0, %0, 0, %2, 0, 5, 0, 0, 0, implicit $mode, implicit $exec + + ; this should not be combined because op_sel_hi is not all set + %6:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %7:vgpr_32 = V_DOT2_F32_F16 0, %6, 0, %0, 0, %2, 0, 0, 4, 0, 0, implicit $mode, implicit $exec + + %8:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %9:vgpr_32 = V_DOT2_F32_F16 10, %8, 8, %0, 13, %2, 1, 0, 7, 4, 5, implicit $mode, implicit $exec + + %10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %11:vgpr_32 = V_FMA_MIX_F32 8, %10, 8, %0, 8, %2, 1, 0, 7, implicit $mode, implicit $exec + + %12:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %13:vgpr_32 = V_FMA_MIXLO_F16 8, %12, 8, %0, 8, %2, 0, %2, 0, 7, implicit $mode, implicit $exec + + %14:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %15:vgpr_32 = V_FMA_MIXHI_F16 8, %14, 8, %0, 8, %2, 1, %0, 0, 7, implicit $mode, implicit $exec + +... + +# when the DPP source isn't a src0 operand the operation should be commuted if possible +# GCN-LABEL: name: dpp_commute_shrink +# GCN: %4:vgpr_32 = V_MUL_U32_U24_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec +# GCN: %7:vgpr_32 = V_AND_B32_dpp %1, %0, %1, 1, 15, 14, 0, implicit $exec +# GCN: %10:vgpr_32 = V_MAX_I32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec +# GCN: %13:vgpr_32 = V_MIN_I32_dpp %1, %0, %1, 1, 15, 14, 0, implicit $exec +# GCN: %16:vgpr_32 = V_SUBREV_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec +name: dpp_commute_shrink +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + + %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec + %4:vgpr_32 = V_MUL_U32_U24_e64 %1, %3, 0, implicit $exec + + %5:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + %6:vgpr_32 = V_MOV_B32_dpp %5, %0, 1, 15, 14, 0, implicit $exec + %7:vgpr_32 = V_AND_B32_e64 %1, %6, implicit $exec + + %8:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + %9:vgpr_32 = V_MOV_B32_dpp %8, %0, 1, 14, 15, 0, implicit $exec + %10:vgpr_32 = V_MAX_I32_e64 %1, %9, implicit $exec + + %11:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec + %12:vgpr_32 = V_MOV_B32_dpp %11, %0, 1, 15, 14, 0, implicit $exec + %13:vgpr_32 = V_MIN_I32_e64 %1, %12, implicit $exec + + %14:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %15:vgpr_32 = V_MOV_B32_dpp %14, %0, 1, 14, 15, 0, implicit $exec + %16:vgpr_32 = V_SUB_U32_e64 %1, %15, 0, implicit $exec + +... + +# do not combine, dpp arg used twice +# GCN-label: name: dpp_arg_twice +# GCN: %4:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %3, 2, %3, 1, 2, implicit $mode, implicit $exec +# GCN: %6:vgpr_32 = V_FMA_F32_e64 2, %5, 2, %1, 2, %5, 1, 2, implicit $mode, implicit $exec +# GCN: %8:vgpr_32 = V_FMA_F32_e64 2, %7, 2, %7, 2, %1, 1, 2, implicit $mode, implicit $exec +name: dpp_arg_twice +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %3, 2, %3, 1, 2, implicit $mode, implicit $exec + + %5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %6:vgpr_32 = V_FMA_F32_e64 2, %5, 2, %1, 2, %5, 1, 2, implicit $mode, implicit $exec + + %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %8:vgpr_32 = V_FMA_F32_e64 2, %7, 2, %7, 2, %1, 1, 2, implicit $mode, implicit $exec + +... + +# when the dpp source isn't a src0 operand the operation should be commuted if possible +# GCN-label: name: dpp_commute_e64 +# GCN: %4:vgpr_32 = V_MUL_U32_U24_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec +# GCN: %7:vgpr_32 = V_FMA_F32_e64_dpp %5, 2, %0, 1, %1, 2, %1, 1, 2, 1, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %10:vgpr_32 = V_SUBREV_U32_e64_dpp %1, %0, %1, 1, 1, 14, 15, 0, implicit $exec +# GCN: %13:vgpr_32, %14:sreg_32_xm0_xexec = V_ADD_CO_U32_e64_dpp %1, %0, %1, 0, 1, 14, 15, 0, implicit $exec +# GCN: %17:vgpr_32, %18:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 5, %16, 0, implicit $exec +name: dpp_commute_e64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + + %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec + %4:vgpr_32 = V_MUL_U32_U24_e64 %1, %3, 1, implicit $exec + + %5:vgpr_32 = IMPLICIT_DEF + %6:vgpr_32 = V_MOV_B32_dpp %5, %0, 1, 15, 15, 1, implicit $exec + %7:vgpr_32 = V_FMA_F32_e64 1, %1, 2, %6, 2, %1, 1, 2, implicit $mode, implicit $exec + + %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %9:vgpr_32 = V_MOV_B32_dpp %8, %0, 1, 14, 15, 0, implicit $exec + %10:vgpr_32 = V_SUB_U32_e64 %1, %9, 1, implicit $exec + + %11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %12:vgpr_32 = V_MOV_B32_dpp %11, %0, 1, 14, 15, 0, implicit $exec + %13:vgpr_32, %14:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %1, %12, 0, implicit $exec + + ; this cannot be combined because immediate as src0 isn't commutable + %15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %16:vgpr_32 = V_MOV_B32_dpp %15, %0, 1, 14, 15, 0, implicit $exec + %17:vgpr_32, %18:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 5, %16, 0, implicit $exec +... + +--- + +# check for floating point modifiers +# GCN-LABEL: name: add_f32_e64 +# GCN: %4:vgpr_32 = V_ADD_F32_e64_dpp %2, 0, %1, 0, %0, 0, 1, 1, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %6:vgpr_32 = V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %8:vgpr_32 = V_ADD_F32_dpp %2, 1, %1, 2, %0, 1, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %10:vgpr_32 = V_ADD_F32_e64_dpp %2, 4, %1, 8, %0, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec + +name: add_f32_e64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + + ; this should be combined as e64 + %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %0, 0, 1, implicit $mode, implicit $exec + + ; this should be combined and shrunk as all modifiers are default + %5:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec + %6:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %0, 0, 0, implicit $mode, implicit $exec + + ; this should be combined and shrunk as modifiers other than abs|neg are default + %7:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec + %8:vgpr_32 = V_ADD_F32_e64 1, %7, 2, %0, 0, 0, implicit $mode, implicit $exec + + ; this should be combined as e64 + %9:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec + %10:vgpr_32 = V_ADD_F32_e64 4, %9, 8, %0, 0, 0, implicit $mode, implicit $exec +... + +# check for e64 modifiers +# GCN-LABEL: name: add_u32_e64 +# GCN: %4:vgpr_32 = V_ADD_U32_dpp %2, %0, %1, 1, 15, 15, 1, implicit $exec +# GCN: %6:vgpr_32 = V_ADD_U32_e64_dpp %2, %0, %1, 1, 1, 15, 15, 1, implicit $exec + +name: add_u32_e64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + + ; this should be combined and shrunk as all modifiers are default + %3:vgpr_32 = V_MOV_B32_dpp undef %2, %0, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_ADD_U32_e64 %3, %1, 0, implicit $exec + + ; this should be combined as _e64 + %5:vgpr_32 = V_MOV_B32_dpp undef %2, %0, 1, 15, 15, 1, implicit $exec + %6:vgpr_32 = V_ADD_U32_e64 %5, %1, 1, implicit $exec +... + +# tests on sequences of dpp consumers +# GCN-LABEL: name: dpp_seq +# GCN: %4:vgpr_32 = V_ADD_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec +# GCN: %5:vgpr_32 = V_SUBREV_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec +# GCN: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec +# broken sequence: +# GCN: %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec + +name: dpp_seq +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec + %4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec + %5:vgpr_32 = V_SUB_U32_e32 %1, %3, implicit $exec + %6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec + + %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec + %8:vgpr_32 = V_ADD_U32_e32 %7, %1, implicit $exec + ; this breaks the sequence + %9:vgpr_32 = V_SUB_U32_e32 5, %7, implicit $exec +... + +# tests on sequences of dpp consumers followed by control flow +# GCN-LABEL: name: dpp_seq_cf +# GCN: %4:vgpr_32 = V_ADD_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec +# GCN: %5:vgpr_32 = V_SUBREV_U32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec +# GCN: %6:vgpr_32 = V_OR_B32_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec + +name: dpp_seq_cf +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 14, 15, 0, implicit $exec + %4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec + %5:vgpr_32 = V_SUB_U32_e32 %1, %3, implicit $exec + %6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec + + %7:sreg_32 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec + %8:sreg_32 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + bb.2: + SI_END_CF %8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec +... + +# GCN-LABEL: name: old_in_diff_bb +# GCN: %4:vgpr_32 = V_ADD_U32_dpp %0, %1, %0, 1, 1, 1, 0, implicit $exec + +name: old_in_diff_bb +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %3:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 1, 1, 0, implicit $exec + %4:vgpr_32 = V_ADD_U32_e32 %3, %0, implicit $exec +... + +# old reg def is in diff BB but bound_ctrl:1 - can combine +# GCN-LABEL: name: old_in_diff_bb_bctrl_zero +# GCN: %4:vgpr_32 = V_ADD_U32_dpp {{%[0-9]}}, %0, %1, 1, 15, 15, 1, implicit $exec + +name: old_in_diff_bb_bctrl_zero +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec +... + +# EXEC mask changed between def and use - cannot combine +# GCN-LABEL: name: exec_changed +# GCN: %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + +name: exec_changed +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec + %5:sreg_64 = COPY $exec, implicit-def $exec + %6:vgpr_32 = V_ADD_U32_e32 %3, %1, implicit $exec +... + +# test if $old definition is correctly tracked through subreg manipulation pseudos + +# GCN-LABEL: name: mul_old_subreg +# GCN: %7:vgpr_32 = V_MUL_I32_I24_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec + +name: mul_old_subreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vreg_64 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_e32 42, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %5:vreg_64 = INSERT_SUBREG %4, %1, %subreg.sub1 ; %5.sub0 is taken from %4 + %6:vgpr_32 = V_MOV_B32_dpp %5.sub0, %1, 1, 1, 1, 0, implicit $exec + %7:vgpr_32 = V_MUL_I32_I24_e32 %6, %0.sub1, implicit $exec +... + +# GCN-LABEL: name: add_old_subreg +# GCN: %5:vgpr_32 = V_ADD_U32_dpp %0.sub1, %1, %0.sub1, 1, 1, 1, 0, implicit $exec + +name: add_old_subreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vreg_64 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vreg_64 = INSERT_SUBREG %0, %2, %subreg.sub1 ; %3.sub1 is inserted + %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 1, 1, 0, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec +... + +# GCN-LABEL: name: add_old_subreg_undef +# GCN: %5:vgpr_32 = V_ADD_U32_dpp undef %3.sub1, %1, %0.sub1, 1, 15, 15, 1, implicit $exec + +name: add_old_subreg_undef +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vreg_64 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vreg_64 = REG_SEQUENCE %2, %subreg.sub0 ; %3.sub1 is undef + %4:vgpr_32 = V_MOV_B32_dpp %3.sub1, %1, 1, 15, 15, 1, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %4, %0.sub1, implicit $exec +... + +# Test instruction which does not have modifiers in VOP1 form but does in DPP form. +# GCN-LABEL: name: dpp_vop1 +# GCN: %3:vgpr_32 = V_CEIL_F32_dpp %0, 0, undef %2:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec +name: dpp_vop1 +tracksRegLiveness: true +body: | + bb.0: + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_MOV_B32_dpp %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $mode, implicit $exec +... + +# Test instruction which does not have modifiers in VOP2 form but does in DPP form. +# GCN-LABEL: name: dpp_min +# GCN: %3:vgpr_32 = V_MIN_F32_dpp %0, 0, undef %2:vgpr_32, 0, undef %4:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec +name: dpp_min +tracksRegLiveness: true +body: | + bb.0: + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_MOV_B32_dpp %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_MIN_F32_e32 %2, undef %3:vgpr_32, implicit $mode, implicit $exec +... + +# Test an undef old operand +# GCN-LABEL: name: dpp_undef_old +# GCN: %3:vgpr_32 = V_CEIL_F32_dpp undef %1:vgpr_32, 0, undef %2:vgpr_32, 1, 15, 15, 1, implicit $mode, implicit $exec +name: dpp_undef_old +tracksRegLiveness: true +body: | + bb.0: + %2:vgpr_32 = V_MOV_B32_dpp undef %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_CEIL_F32_e32 %2, implicit $mode, implicit $exec +... + +# Do not combine a dpp mov which writes a physreg. +# GCN-LABEL: name: phys_dpp_mov_dst +# GCN: $vgpr0 = V_MOV_B32_dpp undef %0:vgpr_32, undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec +# GCN: %2:vgpr_32 = V_CEIL_F32_e32 $vgpr0, implicit $mode, implicit $exec +name: phys_dpp_mov_dst +tracksRegLiveness: true +body: | + bb.0: + $vgpr0 = V_MOV_B32_dpp undef %1:vgpr_32, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec + %2:vgpr_32 = V_CEIL_F32_e32 $vgpr0, implicit $mode, implicit $exec +... + +# Do not combine a dpp mov which reads a physreg. +# GCN-LABEL: name: phys_dpp_mov_old_src +# GCN: %0:vgpr_32 = V_MOV_B32_dpp undef $vgpr0, undef %1:vgpr_32, 1, 15, 15, 1, implicit $exec +# GCN: %2:vgpr_32 = V_CEIL_F32_e32 %0, implicit $mode, implicit $exec +name: phys_dpp_mov_old_src +tracksRegLiveness: true +body: | + bb.0: + %1:vgpr_32 = V_MOV_B32_dpp undef $vgpr0, undef %0:vgpr_32, 1, 15, 15, 1, implicit $exec + %2:vgpr_32 = V_CEIL_F32_e32 %1, implicit $mode, implicit $exec +... + +# Do not combine a dpp mov which reads a physreg. +# GCN-LABEL: name: phys_dpp_mov_src +# GCN: %0:vgpr_32 = V_MOV_B32_dpp undef %1:vgpr_32, undef $vgpr0, 1, 15, 15, 1, implicit $exec +# GCN: %2:vgpr_32 = V_CEIL_F32_e32 %0, implicit $mode, implicit $exec +name: phys_dpp_mov_src +tracksRegLiveness: true +body: | + bb.0: + %1:vgpr_32 = V_MOV_B32_dpp undef %0:vgpr_32, undef $vgpr0, 1, 15, 15, 1, implicit $exec + %2:vgpr_32 = V_CEIL_F32_e32 %1, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: dpp_reg_sequence_both_combined +# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1 +# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3 +# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# GCN: %9:vgpr_32 = IMPLICIT_DEF +# GCN: %8:vgpr_32 = IMPLICIT_DEF +# GCN: %6:vgpr_32 = V_ADD_U32_dpp %9, %1.sub0, %2, 1, 15, 15, 1, implicit $exec +# GCN: %7:vgpr_32 = V_ADDC_U32_dpp %8, %1.sub1, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_both_combined +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# GCN-LABEL: name: dpp_reg_sequence_first_combined +# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1 +# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3 +# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# GCN: %8:vgpr_32 = IMPLICIT_DEF +# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec +# GCN: %5:vreg_64 = REG_SEQUENCE undef %3:vgpr_32, %subreg.sub0, %4, %subreg.sub1 +# GCN: %6:vgpr_32 = V_ADD_U32_dpp %8, %1.sub0, %2, 1, 15, 15, 1, implicit $exec +# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_first_combined +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# GCN-LABEL: name: dpp_reg_sequence_second_combined +# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1 +# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3 +# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec +# GCN: %8:vgpr_32 = IMPLICIT_DEF +# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, undef %4:vgpr_32, %subreg.sub1 +# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec +# GCN: %7:vgpr_32 = V_ADDC_U32_dpp %8, %1.sub1, %2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_second_combined +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# GCN-LABEL: name: dpp_reg_sequence_none_combined +# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1 +# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3 +# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec +# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec +# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 +# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec +# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_none_combined +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 1, 1, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# GCN-LABEL: name: dpp_reg_sequence_exec_changed +# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1 +# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3 +# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec +# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec +# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 +# GCN: S_BRANCH %bb.1 +# GCN: bb.1: +# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec +# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_exec_changed +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + S_BRANCH %bb.1 + + bb.1: + %6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# GCN-LABEL: name: dpp_reg_sequence_subreg +# GCN: %0:vreg_64 = COPY $vgpr0_vgpr1 +# GCN: %1:vreg_64 = COPY $vgpr2_vgpr3 +# GCN: %2:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +# GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec +# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec +# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 +# GCN: %6:vreg_64 = REG_SEQUENCE %5.sub0, %subreg.sub0, %5.sub1, %subreg.sub1 +# GCN: %7:vgpr_32 = V_ADD_U32_e32 %6.sub0, %2, implicit $exec +# GCN: %8:vgpr_32 = V_ADDC_U32_e32 %6.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp_reg_sequence_subreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %8:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %5:vreg_64 = REG_SEQUENCE %4.sub0, %subreg.sub0, %4.sub1, %subreg.sub1 + %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %8, implicit $exec + %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %8, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# GCN-LABEL: name: dpp_reg_sequence_src2_reject +#GCN: %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec +#GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec +#GCN: %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 +#GCN: %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +#GCN: %6:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub0, 1, 2, implicit $mode, implicit $exec +#GCN: %7:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub1, 1, 2, implicit $mode, implicit $exec +name: dpp_reg_sequence_src2_reject +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + ; use of dpp arg as src2, reject + %6:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub0, 1, 2, implicit $mode, implicit $exec + ; cannot commute src0 and src2, and %4.sub0 already rejected, reject + %7:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub1, 1, 2, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: dpp_reg_sequence_src2 +#GCN: %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec +#GCN: %4:vreg_64 = REG_SEQUENCE undef %2:vgpr_32, %subreg.sub0, %3, %subreg.sub1 +#GCN: %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec +#GCN: %6:vgpr_32 = V_FMA_F32_e64_dpp %8, 2, %1.sub0, 2, %5, 2, %4.sub1, 1, 2, 1, 15, 15, 1, implicit $mode, implicit $exec +name: dpp_reg_sequence_src2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %5:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + %6:vgpr_32 = V_FMA_F32_e64 2, %4.sub0, 2, %5, 2, %4.sub1, 1, 2, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: dpp64_add64_impdef +# GCN: %3:vgpr_32 = V_ADD_U32_dpp %1.sub0, %0.sub0, undef %4:vgpr_32, 1, 15, 15, 1, implicit $exec +# GCN: %5:vgpr_32 = V_ADDC_U32_dpp %1.sub1, %0.sub1, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp64_add64_impdef +tracksRegLiveness: true +body: | + bb.0: + %0:vreg_64 = IMPLICIT_DEF + %1:vreg_64 = IMPLICIT_DEF + %2:vreg_64 = V_MOV_B64_DPP_PSEUDO %1:vreg_64, %0:vreg_64, 1, 15, 15, 1, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %2.sub0, undef %4:vgpr_32, implicit $exec + %6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec +... + +# GCN-LABEL: name: dpp64_add64_undef +# GCN: %3:vgpr_32 = V_ADD_U32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit $exec +# GCN: %5:vgpr_32 = V_ADDC_U32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, undef %4:vgpr_32, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec +name: dpp64_add64_undef +tracksRegLiveness: true +body: | + bb.0: + %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %2.sub0, undef %4:vgpr_32, implicit $exec + %6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec +... + + +# GCN-LABEL: name: cndmask_with_src2 +# GCN: %5:vgpr_32 = V_CNDMASK_B32_e64 0, %3, 0, %1, %4, implicit $exec +# GCN: %8:vgpr_32 = V_CNDMASK_B32_e64_dpp %2, 4, %1, 0, %1, %7, 1, 15, 15, 1, implicit $exec +name: cndmask_with_src2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %4:sreg_32_xm0_xexec = IMPLICIT_DEF + %5:vgpr_32 = V_CNDMASK_B32_e64 0, %3, 0, %1, %4, implicit $exec + + ; src2 is legal for _e64 + %6:vgpr_32 = V_MOV_B32_dpp %2, %1, 1, 15, 15, 1, implicit $exec + %7:sreg_32_xm0_xexec = IMPLICIT_DEF + %8:vgpr_32 = V_CNDMASK_B32_e64 4, %6, 0, %1, %7, implicit $exec +... + +--- + +# Make sure flags aren't dropped +# GCN-LABEL: name: flags_add_f32_e64 +# GCN: %4:vgpr_32 = nnan nofpexcept V_ADD_F32_dpp %2, 0, %1, 0, %0, 1, 15, 15, 1, implicit $mode, implicit $exec +name: flags_add_f32_e64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + + %3:vgpr_32 = V_MOV_B32_dpp undef %2, %1, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = nofpexcept nnan V_ADD_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %4 + +... + +# GCN-LABEL: name: dont_combine_more_than_one_operand +# GCN: %3:vgpr_32 = V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec +name: dont_combine_more_than_one_operand +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: dont_combine_more_than_one_operand_dpp_reg_sequence +# GCN: %5:vgpr_32 = V_ADD_U32_e32 %4.sub0, %4.sub0, implicit $exec +# GCN: %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec +name: dont_combine_more_than_one_operand_dpp_reg_sequence +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 15, 15, 1, implicit $exec + %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %5:vgpr_32 = V_ADD_U32_e32 %4.sub0, %4.sub0, implicit $exec + %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT,PREGFX10,PREGFX10-OPT %s ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-NOOPT,PREGFX10,PREGFX10-NOOPT %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=VI,VI-OPT %s ; FIXME: The register allocator / scheduler should be able to avoid these hazards. diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s ; RUN: llc -march=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s ; GCN-LABEL: {{^}}dpp_test: ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} @@ -26,7 +27,7 @@ ; GCN-LABEL: {{^}}dpp_test1: -; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GFX10,GFX11: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0 @@ -51,7 +52,7 @@ } ; GCN-LABEL: {{^}}update_dpp64_test: -; GCN: load_dwordx2 v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] +; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] ; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} ; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) { @@ -65,12 +66,13 @@ ; GCN-LABEL: {{^}}update_dpp64_imm_old_test: ; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9 -; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047 +; GFX8-OPT-DAG,GFX10-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047 +; GFX11-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047 ; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9 ; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x7047 -; GCN-DAG: load_dwordx2 v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] +; GCN-DAG: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]] ; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} -; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} +; GFX8-OPT-DAG,GFX10-DAG,GFX11-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @update_dpp64_imm_old_test(i64 addrspace(1)* %arg, i64 %in2) { diff --git a/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir b/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir @@ -0,0 +1,68 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN + +--- + +name: vopc +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GCN-LABEL: name: vopc + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN: V_CMP_LT_F32_e32_dpp [[DEF]], 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec + ; GCN: V_CMPX_EQ_I16_e32 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN: V_CMP_CLASS_F16_e32_dpp [[DEF]], 0, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec + ; GCN: [[V_CMP_GE_F16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_e64_dpp [[DEF]], 1, [[COPY1]], 0, [[COPY]], 1, 1, 15, 15, 1, implicit $mode, implicit $exec + ; GCN: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec + ; GCN: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp1]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec + ; GCN: V_CMP_CLASS_F32_e32_dpp [[DEF]], 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec + ; GCN: V_CMP_NGE_F16_e32_dpp [[DEF]], 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN: [[V_CMP_NGE_F16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_e64_dpp [[DEF]], 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec + ; GCN: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F16_e64_dpp]], 10101, implicit-def $scc + ; GCN: V_CMP_GT_I32_e32_dpp [[DEF]], [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = IMPLICIT_DEF + + %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMP_LT_F32_e32 %4, %0, implicit-def $vcc, implicit $mode, implicit $exec + + ; unsafe to combine cmpx + %5:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMPX_EQ_I16_e32 %5, %0, implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec + + %6:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMP_CLASS_F16_e32 %6, %0, implicit-def $vcc, implicit $mode, implicit $exec + + %7:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %8:sgpr_32 = V_CMP_GE_F16_e64 1, %7, 0, %0, 1, implicit $mode, implicit $exec + + ; unsafe to combine cmpx + %9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMPX_GT_U32_nosdst_e64 %9, %0, implicit-def $exec, implicit $mode, implicit $exec + + %11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %12:sgpr_32 = V_CMP_CLASS_F32_e64 2, %11, %0, implicit $mode, implicit $exec + + ; shrink + %13:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %14:sgpr_32 = V_CMP_NGE_F16_e64 0, %13, 0, %0, 0, implicit $mode, implicit $exec + + ; do not shrink, sdst used + %15:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %16:sgpr_32 = V_CMP_NGE_F16_e64 0, %15, 0, %0, 0, implicit $mode, implicit $exec + %17:sgpr_32 = S_AND_B32 %16, 10101, implicit-def $scc + + ; commute + %18:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMP_LT_I32_e32 %0, %18, implicit-def $vcc, implicit $exec + +...