diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -70,6 +70,10 @@ // instructions. void processBundle(); + // Run on an individual instruction in hazard recognizer mode. This can be + // used on a newly inserted instruction before returning from PreEmitNoops. + void runOnInstruction(MachineInstr *MI); + int getWaitStatesSince(IsHazardFn IsHazard, int Limit); int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit); int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit); @@ -101,6 +105,7 @@ bool fixVALUPartialForwardingHazard(MachineInstr *MI); bool fixVALUTransUseHazard(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); + bool fixShift64HighRegBug(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); int checkMAIHazards908(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -300,6 +300,20 @@ CurrCycleInstr = nullptr; } +void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) { + assert(IsHazardRecognizerMode); + + unsigned NumPreNoops = PreEmitNoops(MI); + EmitNoops(NumPreNoops); + if (MI->isInsideBundle()) + insertNoopsInBundle(MI, TII, NumPreNoops); + else + TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI), + NumPreNoops); + EmitInstruction(MI); + AdvanceCycle(); +} + unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { IsHazardRecognizerMode = true; CurrCycleInstr = MI; @@ -1087,6 +1101,7 @@ fixVALUPartialForwardingHazard(MI); fixVALUTransUseHazard(MI); fixWMMAHazards(MI); + fixShift64HighRegBug(MI); } bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { @@ -1739,6 +1754,105 @@ return true; } +bool 
GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { + if (!ST.hasShift64HighRegBug()) + return false; + + switch (MI->getOpcode()) { + default: + return false; + case AMDGPU::V_LSHLREV_B64_e64: + case AMDGPU::V_LSHRREV_B64_e64: + case AMDGPU::V_ASHRREV_I64_e64: + break; + } + + MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0); + if (!Amt->isReg()) + return false; + + Register AmtReg = Amt->getReg(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + // Check if this is the last VGPR in the allocation block. + if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7) + return false; + + if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1)) + return false; + + MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1); + bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg); + bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI); + bool Overlapped = OverlappedSrc || OverlappedDst; + + assert(!OverlappedDst || !OverlappedSrc || + Src1->getReg() == MI->getOperand(0).getReg()); + assert(ST.needsAlignedVGPRs()); + static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1); + + Register NewReg; + for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass + : AMDGPU::VGPR_32RegClass) { + if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) { + NewReg = Reg; + break; + } + } + + Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1) + : NewReg; + Register NewAmtLo; + + if (Overlapped) + NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0); + + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + // Insert a full wait count because the found register might have a pending wait. + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT)) + .addImm(0); + + // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them. 
+ if (Overlapped) + runOnInstruction( + BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo) + .addDef(AmtReg - 1) + .addReg(AmtReg - 1) + .addReg(NewAmtLo)); + runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt) + .addDef(AmtReg) + .addReg(AmtReg) + .addReg(NewAmt)); + + // Instructions emitted after the current instruction will be processed by the + // parent loop of the hazard recognizer in a natural way. + BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32), + AmtReg) + .addDef(NewAmt) + .addReg(NewAmt) + .addReg(AmtReg); + if (Overlapped) + BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32), + AmtReg - 1) + .addDef(NewAmtLo) + .addReg(NewAmtLo) + .addReg(AmtReg - 1); + + // Re-running the hazard recognizer on the modified instruction is not needed; + // the inserted V_SWAP_B32 has already both read and written the new registers, + // so hazards related to these registers have already been handled. + Amt->setReg(NewAmt); + Amt->setIsKill(false); + if (OverlappedDst) + MI->getOperand(0).setReg(NewReg); + if (OverlappedSrc) { + Src1->setReg(NewReg); + Src1->setIsKill(false); + } + + return true; +} + int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) { int NSAtoVMEMWaitStates = 1; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1008,6 +1008,12 @@ return HasLdsBranchVmemWARHazard; } + // The shift amount of a 64 bit shift cannot be the highest allocated register + // if it is also at the end of the allocation block. + bool hasShift64HighRegBug() const { + return GFX90AInsts && !GFX940Insts; + } + + // Has one cycle hazard on transcendental instruction feeding a + // non transcendental VALU. 
bool hasTransForwardingHazard() const { return GFX940Insts; } diff --git a/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir b/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hazard-shift64.mir @@ -0,0 +1,250 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: highest_reg_shift_amt_v7 +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_v7 + ; GCN: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr0, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr0, implicit $exec + ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec + ; GCN-NEXT: $vgpr7, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr7, implicit $exec + $vgpr7 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec +... + +--- +name: highest_reg_shift_amt_v15 +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_v15 + ; GCN: $vgpr15 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr0, $vgpr15 = V_SWAP_B32 $vgpr15, $vgpr0, implicit $exec + ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec + ; GCN-NEXT: $vgpr15, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr15, implicit $exec + $vgpr15 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr15, killed $vgpr2_vgpr3, implicit $exec +... 
+ +--- +name: highest_reg_shift_amt_v255 +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_v255 + ; GCN: $vgpr255 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr0, $vgpr255 = V_SWAP_B32 $vgpr255, $vgpr0, implicit $exec + ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec + ; GCN-NEXT: $vgpr255, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr255, implicit $exec + $vgpr255 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr255, killed $vgpr2_vgpr3, implicit $exec +... + +--- +name: highest_reg_shift_amt_used_v0_dst +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_dst + ; GCN: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr4, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr4, killed $vgpr2_vgpr3, implicit $exec + ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec + $vgpr7 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec +... + +--- +name: highest_reg_shift_amt_used_v0_src +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_src + ; GCN: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr4, implicit $exec + ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr4, killed $vgpr0_vgpr1, implicit $exec + ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec + $vgpr7 = IMPLICIT_DEF + $vgpr0_vgpr1 = IMPLICIT_DEF + renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec +... 
+ +--- +name: highest_reg_shift_amt_used_v0_both +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_used_v0_both + ; GCN: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr2, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr2, killed $vgpr0_vgpr1, implicit $exec + ; GCN-NEXT: $vgpr7, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr7, implicit $exec + $vgpr7 = IMPLICIT_DEF + $vgpr0_vgpr1 = IMPLICIT_DEF + renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec +... + +--- +name: highest_reg_shift_amt_overlapped_src +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_src + ; GCN: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr2, implicit $exec + ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr3, implicit $exec + ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr3, $vgpr2_vgpr3, implicit $exec + ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec + ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec + $vgpr7 = IMPLICIT_DEF + $vgpr6_vgpr7 = IMPLICIT_DEF + renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec +... 
+ +--- +name: highest_reg_shift_amt_overlapped_dst +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_dst + ; GCN: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr2, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr2, implicit $exec + ; GCN-NEXT: $vgpr3, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr3, implicit $exec + ; GCN-NEXT: $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr3, killed $vgpr0_vgpr1, implicit $exec + ; GCN-NEXT: $vgpr6, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr6, implicit $exec + ; GCN-NEXT: $vgpr7, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr7, implicit $exec + $vgpr7 = IMPLICIT_DEF + $vgpr0_vgpr1 = IMPLICIT_DEF + renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec +... + +--- +name: highest_reg_shift_amt_overlapped_both +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_overlapped_both + ; GCN: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr0, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr0, implicit $exec + ; GCN-NEXT: $vgpr1, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr1, implicit $exec + ; GCN-NEXT: $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr1, $vgpr0_vgpr1, implicit $exec + ; GCN-NEXT: $vgpr6, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr6, implicit $exec + ; GCN-NEXT: $vgpr7, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr7, implicit $exec + $vgpr7 = IMPLICIT_DEF + $vgpr6_vgpr7 = IMPLICIT_DEF + renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec +... 
+ +--- +name: highest_reg_shift_amt_hazard_in_swap +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_hazard_in_swap + ; GCN: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0_vgpr1 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: S_NOP 4 + ; GCN-NEXT: $vgpr4, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr4, implicit $exec + ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr4, killed $vgpr0_vgpr1, implicit $exec + ; GCN-NEXT: $vgpr7, $vgpr4 = V_SWAP_B32 $vgpr4, $vgpr7, implicit $exec + $vgpr7 = IMPLICIT_DEF + $vgpr0_vgpr1 = IMPLICIT_DEF + $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr0_vgpr1, implicit $exec +... + +--- +name: highest_reg_shift_amt_hazard_in_swap2 +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_hazard_in_swap2 + ; GCN: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr0, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr0, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr1, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr1, implicit $exec + ; GCN-NEXT: $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr1, $vgpr0_vgpr1, implicit $exec + ; GCN-NEXT: $vgpr6, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr6, implicit $exec + ; GCN-NEXT: $vgpr7, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr7, implicit $exec + $vgpr7 = IMPLICIT_DEF + $vgpr6_vgpr7 = IMPLICIT_DEF + $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec + renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec +... 
+ +--- +name: highest_reg_shift_amt_v7_bundle +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_v7_bundle + ; GCN: BUNDLE implicit-def $vgpr2_vgpr3, implicit-def $vgpr7 { + ; GCN-NEXT: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr2_vgpr3 = IMPLICIT_DEF + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr0, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr0, implicit $exec + ; GCN-NEXT: renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 $vgpr0, killed $vgpr2_vgpr3, implicit $exec + ; GCN-NEXT: } + ; GCN-NEXT: $vgpr7, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr7, implicit $exec + BUNDLE implicit-def $vgpr2_vgpr3, implicit-def $vgpr7 { + $vgpr7 = IMPLICIT_DEF + $vgpr2_vgpr3 = IMPLICIT_DEF + renamable $vgpr2_vgpr3 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr2_vgpr3, implicit $exec + } +... + +--- +name: highest_reg_shift_amt_hazard_in_swap2_bundle +body: | + bb.0: + + ; GCN-LABEL: name: highest_reg_shift_amt_hazard_in_swap2_bundle + ; GCN: BUNDLE implicit-def $vgpr1, implicit-def $vgpr6_vgpr7 { + ; GCN-NEXT: $vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr6_vgpr7 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec + ; GCN-NEXT: S_WAITCNT 0 + ; GCN-NEXT: $vgpr0, $vgpr6 = V_SWAP_B32 $vgpr6, $vgpr0, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr1, $vgpr7 = V_SWAP_B32 $vgpr7, $vgpr1, implicit $exec + ; GCN-NEXT: $vgpr0_vgpr1 = V_LSHRREV_B64_e64 $vgpr1, $vgpr0_vgpr1, implicit $exec + ; GCN-NEXT: } + ; GCN-NEXT: $vgpr6, $vgpr0 = V_SWAP_B32 $vgpr0, $vgpr6, implicit $exec + ; GCN-NEXT: $vgpr7, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr7, implicit $exec + BUNDLE implicit-def $vgpr1, implicit-def $vgpr6_vgpr7 { + $vgpr7 = IMPLICIT_DEF + $vgpr6_vgpr7 = IMPLICIT_DEF + $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr7, $vgpr7, $vgpr1, implicit $exec + renamable $vgpr6_vgpr7 = V_LSHRREV_B64_e64 killed $vgpr7, killed $vgpr6_vgpr7, implicit $exec + } +...