diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -968,10 +968,10 @@ if (LateCFGStructurize) { addPass(createAMDGPUMachineCFGStructurizerPass()); } - addPass(createSIWholeQuadModePass()); } void GCNPassConfig::addFastRegAlloc() { + addPass(createSIWholeQuadModePass()); // FIXME: We have to disable the verifier here because of PHIElimination + // TwoAddressInstructions disabling it. @@ -987,6 +987,11 @@ } void GCNPassConfig::addOptimizedRegAlloc() { + // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation + // instructions that cause scheduling barriers. + insertPass(&MachineSchedulerID, &SIWholeQuadModeID); + insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID); + if (OptExecMaskPreRA) insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); @@ -996,9 +1001,6 @@ // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowID, false); - // This must be run just after RegisterCoalescing. 
- insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false); - if (EnableDCEInRA) insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -154,6 +154,11 @@ MachineRegisterInfo *MRI; LiveIntervals *LIS; + unsigned AndOpc; + unsigned XorTermrOpc; + unsigned OrSaveExecOpc; + unsigned Exec; + DenseMap Instructions; MapVector Blocks; SmallVector LiveMaskQueries; @@ -164,6 +169,8 @@ void markInstruction(MachineInstr &MI, char Flag, std::vector &Worklist); + void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, + unsigned SubReg, char Flag, std::vector &Worklist); void markInstructionUses(const MachineInstr &MI, char Flag, std::vector &Worklist); char scanInstructions(MachineFunction &MF, std::vector &Worklist); @@ -252,6 +259,8 @@ assert(!(Flag & StateExact) && Flag != 0); + LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); + // Remove any disabled states from the flag. The user that required it gets // an undefined value in the helper lanes. For example, this can happen if // the result of an atomic is used by instruction that requires WQM, where @@ -267,9 +276,70 @@ Worklist.push_back(&MI); } +/// Mark all relevant definitions of register \p Reg in usage \p UseMI. 
+///
+/// Walks the value numbers of \p LR that reach \p UseMI. The defining
+/// instruction of every non-PHI value is passed to markInstruction(), PHI
+/// values are followed into each predecessor block, and a definition that
+/// writes only part of the register is followed to the value live into it.
+/// Does nothing when no value of \p LR is live into \p UseMI.
+///
+/// \param UseMI    Instruction reading the register.
+/// \param LR       Live range queried for the values reaching \p UseMI.
+/// \param Reg      Register compared against the operands of each defining
+///                 instruction (callers also pass a register unit here).
+/// \param SubReg   Sub-register index read by the use; an operand with a
+///                 non-zero, different sub-register index is a partial def.
+/// \param Flag     State forwarded to markInstruction().
+/// \param Worklist Forwarded to markInstruction().
+void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
+                               Register Reg, unsigned SubReg, char Flag,
+                               std::vector<WorkItem> &Worklist) {
+  // Only used once the function has left SSA form; in SSA the caller walks
+  // MRI->def_instructions() instead.
+  assert(!MRI->isSSA());
+
+  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
+
+  const VNInfo *UseValue = LR.Query(LIS->getInstructionIndex(UseMI)).valueIn();
+  if (!UseValue)
+    return;
+
+  // A value is recorded in Visited when it is queued, not when it is popped,
+  // so each value number is processed exactly once even if several
+  // predecessors or operands lead to it.
+  SmallPtrSet<const VNInfo *, 4> Visited;
+  SmallVector<const VNInfo *, 4> ToProcess;
+  auto Enqueue = [&](const VNInfo *VN) {
+    if (VN && Visited.insert(VN).second)
+      ToProcess.push_back(VN);
+  };
+  Enqueue(UseValue);
+
+  while (!ToProcess.empty()) {
+    const VNInfo *Value = ToProcess.pop_back_val();
+
+    if (Value->isPHIDef()) {
+      // Need to mark all defs used in the PHI node: follow the value that is
+      // live at the end of every predecessor block.
+      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
+      assert(MBB && "Phi-def has no defining MBB");
+      for (const MachineBasicBlock *Pred : MBB->predecessors())
+        Enqueue(LR.getVNInfoBefore(LIS->getMBBEndIdx(Pred)));
+      continue;
+    }
+
+    MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
+    assert(MI && "Def has no defining instruction");
+    markInstruction(*MI, Flag, Worklist);
+
+    // Iterate over all operands to find relevant definitions.
+    for (const MachineOperand &Op : MI->operands()) {
+      if (!(Op.isReg() && Op.getReg() == Reg))
+        continue;
+
+      // Does this def cover whole register?
+      const bool DefinesFullReg =
+          Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg;
+      if (DefinesFullReg)
+        continue;
+
+      // Partial definition; need to follow and mark input value. The value
+      // live into MI does not depend on the operand, so one query suffices.
+      Enqueue(LR.Query(LIS->getInstructionIndex(*MI)).valueIn());
+      break;
+    }
+  }
+}
+
 /// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, std::vector &Worklist) { + + LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " + << MI); + for (const MachineOperand &Use : MI.uses()) { if (!Use.isReg() || !Use.isUse()) continue; @@ -289,20 +359,28 @@ if (!Value) continue; - // Since we're in machine SSA, we do not need to track physical - // registers across basic blocks. - if (Value->isPHIDef()) - continue; - - markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag, - Worklist); + if (MRI->isSSA()) { + // Since we're in machine SSA, we do not need to track physical + // registers across basic blocks. + if (Value->isPHIDef()) + continue; + markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag, + Worklist); + } else { + markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist); + } } continue; } - for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) - markInstruction(DefMI, Flag, Worklist); + if (MRI->isSSA()) { + for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) + markInstruction(DefMI, Flag, Worklist); + } else { + LiveRange &LR = LIS->getInterval(Reg); + markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist); + } } } @@ -572,7 +650,10 @@ break; Idx = Next; } else { - SlotIndex Next = S->end.getNextIndex().getBaseIndex(); + MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex()); + assert(EndMI && "Segment does not end on valid instruction"); + auto NextI = std::next(EndMI->getIterator()); + SlotIndex Next = LIS->getInstructionIndex(*NextI); if (Next > LastIdx) break; Idx = Next; @@ -588,6 +669,23 @@ MBBI = MBB.end(); } + // Move insertion point past any operations modifying EXEC. + // This assumes that the value of SCC defined by any of these operations + // does not need to be preserved. 
+ while (MBBI != Last) { + bool IsExecDef = false; + for (const MachineOperand &MO : MBBI->operands()) { + if (MO.isReg() && MO.isDef()) { + IsExecDef |= + MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC; + } + } + if (!IsExecDef) + break; + MBBI++; + S = nullptr; + } + if (S) MBBI = saveSCC(MBB, MBBI); @@ -682,8 +780,11 @@ const TargetRegisterClass *BoolRC = TRI->getBoolRC(); auto II = MBB.getFirstNonPHI(), IE = MBB.end(); - if (isEntry) - ++II; // Skip the instruction that saves LiveMask + if (isEntry) { + // Skip the instruction that saves LiveMask + if (II != IE && II->getOpcode() == AMDGPU::COPY) + ++II; + } // This stores the first instruction where it's safe to switch from WQM to // Exact or vice versa. @@ -694,6 +795,7 @@ // FirstWQM since if it's safe to switch to/from WWM, it must be safe to // switch to/from WQM as well. MachineBasicBlock::iterator FirstWWM = IE; + for (;;) { MachineBasicBlock::iterator Next = II; char Needs = StateExact | StateWQM; // WWM is disabled by default @@ -806,6 +908,7 @@ if (II == IE) break; + II = Next; } assert(!SavedWQMReg); @@ -816,6 +919,7 @@ for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); Register Dest = MI->getOperand(0).getReg(); + MachineInstr *Copy = BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) .addReg(LiveMaskReg); @@ -830,18 +934,35 @@ assert(MI->getNumExplicitOperands() == 2); const Register Reg = MI->getOperand(0).getReg(); + const unsigned SubReg = MI->getOperand(0).getSubReg(); if (TRI->isVGPR(*MRI, Reg)) { const TargetRegisterClass *regClass = Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg); + if (SubReg) + regClass = TRI->getSubRegClass(regClass, SubReg); const unsigned MovOp = TII->getMovOpcode(regClass); MI->setDesc(TII->get(MovOp)); // And make it implicitly depend on exec (like all VALU movs should do). 
MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); - } else { + } else if (!MRI->isSSA()) { + // Remove early-clobber and exec dependency from simple SGPR copies. + // This allows some to be eliminated during/post RA. + LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI); + if (MI->getOperand(0).isEarlyClobber()) { + LIS->removeInterval(Reg); + MI->getOperand(0).setIsEarlyClobber(false); + LIS->createAndComputeVirtRegInterval(Reg); + } + int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC); + while (Index >= 0) { + MI->RemoveOperand(Index); + Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC); + } MI->setDesc(TII->get(AMDGPU::COPY)); + LLVM_DEBUG(dbgs() << " -> " << *MI); } } for (MachineInstr *MI : LowerToCopyInstrs) { @@ -877,9 +998,20 @@ MRI = &MF.getRegInfo(); LIS = &getAnalysis(); + if (ST->isWave32()) { + AndOpc = AMDGPU::S_AND_B32; + XorTermrOpc = AMDGPU::S_XOR_B32_term; + OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; + Exec = AMDGPU::EXEC_LO; + } else { + AndOpc = AMDGPU::S_AND_B64; + XorTermrOpc = AMDGPU::S_XOR_B64_term; + OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; + Exec = AMDGPU::EXEC; + } + char GlobalFlags = analyzeFunction(MF); unsigned LiveMaskReg = 0; - unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; if (!(GlobalFlags & StateWQM)) { lowerLiveMaskQueries(Exec); if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty()) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -5,7 +5,11 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: gather4_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 @@ -15,12 +19,8 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 @@ -29,7 +29,10 @@ ; ; GFX10NSA-LABEL: gather4_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -39,15 +42,12 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; 
GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -58,24 +58,24 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %face) { ; GFX9-LABEL: gather4_cube: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da @@ -84,28 +84,28 @@ ; ; GFX10NSA-LABEL: gather4_cube: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 
v3, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -116,24 +116,24 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %slice) { ; GFX9-LABEL: gather4_2darray: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: 
v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da @@ -142,28 +142,28 @@ ; ; GFX10NSA-LABEL: gather4_2darray: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; 
return to shader part epilog main_body: @@ -174,7 +174,11 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { ; GFX9-LABEL: gather4_c_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 @@ -184,12 +188,8 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 @@ -198,7 +198,10 @@ ; ; GFX10NSA-LABEL: gather4_c_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -208,15 +211,12 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: 
s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -227,24 +227,24 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 @@ -253,28 +253,28 @@ ; ; GFX10NSA-LABEL: gather4_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, 
s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -285,24 +285,24 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_c_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: 
s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 @@ -311,28 +311,28 @@ ; ; GFX10NSA-LABEL: gather4_c_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -343,7 +343,11 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t) { ; GFX9-LABEL: gather4_b_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; 
GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 @@ -353,12 +357,8 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 @@ -367,7 +367,10 @@ ; ; GFX10NSA-LABEL: gather4_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -377,15 +380,12 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -396,7 +396,11 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t) { ; GFX9-LABEL: gather4_c_b_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: 
s_wqm_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 @@ -406,12 +410,8 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 ; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 @@ -420,7 +420,10 @@ ; ; GFX10NSA-LABEL: gather4_c_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -430,15 +433,12 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -449,24 +449,24 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_b_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: 
s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 @@ -475,28 +475,28 @@ ; ; GFX10NSA-LABEL: gather4_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 -; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 
+; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: @@ -507,24 +507,24 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, half %s, half %t, half %clamp) { ; GFX9-LABEL: gather4_c_b_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_lshl_b32 s12, s0, 16 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3 +; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16 @@ -533,28 +533,28 @@ ; ; GFX10NSA-LABEL: gather4_c_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 
-; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 -; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 -; GFX10NSA-NEXT: s_mov_b32 s5, s7 ; GFX10NSA-NEXT: s_mov_b32 s6, s8 -; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 -; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 -; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 +; GFX10NSA-NEXT: s_mov_b32 s1, s3 +; GFX10NSA-NEXT: s_mov_b32 s3, s5 +; GFX10NSA-NEXT: s_mov_b32 s5, s7 +; GFX10NSA-NEXT: s_mov_b32 s7, s9 +; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 +; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll @@ -26,7 +26,10 @@ ; ; GFX10NSA-LABEL: gather4_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; 
GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -36,12 +39,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -74,7 +74,10 @@ ; ; GFX10NSA-LABEL: gather4_cube: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -84,12 +87,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -122,7 +122,10 @@ ; ; GFX10NSA-LABEL: gather4_2darray: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -132,12 +135,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: 
s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -170,7 +170,10 @@ ; ; GFX10NSA-LABEL: gather4_c_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -180,12 +183,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -218,7 +218,10 @@ ; ; GFX10NSA-LABEL: gather4_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -228,12 +231,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, 
exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -266,7 +266,10 @@ ; ; GFX10NSA-LABEL: gather4_c_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -276,12 +279,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -314,7 +314,10 @@ ; ; GFX10NSA-LABEL: gather4_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -324,12 +327,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: 
s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -362,7 +362,10 @@ ; ; GFX10NSA-LABEL: gather4_c_b_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -372,12 +375,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -410,7 +410,10 @@ ; ; GFX10NSA-LABEL: gather4_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -420,12 +423,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -458,7 +458,10 @@ ; ; GFX10NSA-LABEL: gather4_c_b_cl_2d: ; GFX10NSA: ; %bb.0: ; %main_body +; 
GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -468,12 +471,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -674,7 +674,10 @@ ; ; GFX10NSA-LABEL: gather4_2d_dmask_2: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -684,12 +687,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -722,7 +722,10 @@ ; ; GFX10NSA-LABEL: gather4_2d_dmask_4: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, 
s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -732,12 +735,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog @@ -770,7 +770,10 @@ ; ; GFX10NSA-LABEL: gather4_2d_dmask_8: ; GFX10NSA: ; %bb.0: ; %main_body +; GFX10NSA-NEXT: s_mov_b32 s1, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 +; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 @@ -780,12 +783,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 -; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi -; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll @@ -26,7 +26,10 @@ ; ; GFX10-LABEL: gather4_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo 
; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -36,12 +39,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -74,7 +74,10 @@ ; ; GFX10-LABEL: gather4_c_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -84,12 +87,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -122,7 +122,10 @@ ; ; GFX10-LABEL: gather4_cl_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -132,12 +135,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; 
GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -170,7 +170,10 @@ ; ; GFX10-LABEL: gather4_c_cl_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -180,12 +183,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -218,7 +218,10 @@ ; ; GFX10-LABEL: gather4_b_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -228,12 +231,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: 
image_gather4_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -266,7 +266,10 @@ ; ; GFX10-LABEL: gather4_c_b_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -276,12 +279,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_b_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -356,7 +356,10 @@ ; ; GFX10-LABEL: gather4_c_b_cl_o_2d: ; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s1, exec_lo ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 @@ -366,12 +369,9 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s14, exec_lo ; GFX10-NEXT: s_mov_b32 s10, s12 ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs 
< %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg) @@ -62,7 +62,9 @@ ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 ; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]] +; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] ; GFX8MORE: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { entry: @@ -86,7 +88,9 @@ ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 ; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]] +; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] ; GFX8MORE: buffer_atomic_add v[[value]] define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %vindex) { entry: @@ -160,7 +164,9 @@ ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 ; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX10: s_mov_b32 
s[[copy_value:[0-9]+]], s[[scalar_value]] +; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s +; RUN: llc 
-march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -54,7 +54,9 @@ ; DPPCOMB: v_add_u32_dpp ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 ; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 -; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]] +; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] ; GFX8MORE: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { entry: @@ -160,7 +162,9 @@ ; DPPCOMB: v_add_u32_dpp ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 ; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 -; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]] +; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* 
%out, i32 addrspace(1)* %inout) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -370,17 +370,16 @@ ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -392,24 +391,24 @@ ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB2_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 
v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB2_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -422,17 +421,16 @@ ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -444,23 +442,23 @@ ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB2_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB2_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 @@ -472,45 +470,49 @@ ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: 
v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: 
v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB2_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 @@ -521,7 +523,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -531,38 +533,39 @@ ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 
bound_ctrl:0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB2_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, 
s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 @@ -571,9 +574,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB2_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -607,17 +610,16 @@ ; GFX8-LABEL: add_i32_varying_gfx1032: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -629,24 +631,24 @@ ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: 
s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB3_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB3_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -659,17 +661,16 @@ ; GFX9-LABEL: add_i32_varying_gfx1032: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -681,23 +682,23 @@ ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: 
v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB3_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB3_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 @@ -709,45 +710,49 @@ ; ; GFX1064-LABEL: add_i32_varying_gfx1032: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, 
v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: 
v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB3_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 @@ -758,7 +763,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -768,38 +773,39 @@ ; ; GFX1032-LABEL: add_i32_varying_gfx1032: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: 
s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB3_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 @@ -808,9 +814,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB3_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -844,17 +850,16 @@ ; GFX8-LABEL: add_i32_varying_gfx1064: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -866,24 +871,24 @@ ; GFX8-NEXT: v_add_u32_dpp v2, vcc, 
v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -896,17 +901,16 @@ ; GFX9-LABEL: add_i32_varying_gfx1064: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:0 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -918,23 +922,23 @@ ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 @@ -946,45 +950,49 @@ ; ; GFX1064-LABEL: add_i32_varying_gfx1064: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: 
v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 @@ -995,7 +1003,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,38 +1013,39 @@ ; ; GFX1032-LABEL: add_i32_varying_gfx1064: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; 
GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 @@ -1045,9 +1054,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1913,17 +1922,16 @@ ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec 
-; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -1935,24 +1943,24 @@ ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB10_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB10_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 @@ -1965,17 +1973,16 @@ ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; 
GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -1987,23 +1994,23 @@ ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB10_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB10_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 @@ -2015,45 +2022,49 @@ ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; 
GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB10_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 @@ -2064,7 +2075,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2074,38 +2085,39 @@ ; ; 
GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; 
GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB10_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 @@ -2114,9 +2126,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB10_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2649,16 +2661,16 @@ ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, -1 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -2670,24 +2682,24 @@ ; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB14_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB14_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 @@ -2700,16 +2712,16 @@ ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: 
v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, -1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, -1 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -2721,23 +2733,23 @@ ; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB14_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB14_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 @@ -2749,44 +2761,49 @@ ; ; GFX1064-LABEL: and_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 
v1, -1 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf 
bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB14_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 @@ -2797,7 +2814,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2807,37 +2824,39 @@ ; ; GFX1032-LABEL: and_i32_varying: ; GFX1032: 
; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf 
bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB14_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 @@ -2846,9 +2865,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB14_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2882,17 +2901,16 @@ ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -2904,24 +2922,24 @@ ; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB15_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB15_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 @@ -2934,17 +2952,16 @@ ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -2956,23 +2973,23 @@ ; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB15_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB15_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 @@ -2984,45 +3001,49 @@ ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; 
GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 
-; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB15_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 @@ -3033,7 +3054,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3043,38 +3064,39 
@@ ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; 
GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB15_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 @@ -3083,9 +3105,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB15_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3119,17 +3141,16 @@ ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -3141,24 +3162,24 @@ ; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB16_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB16_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 @@ -3171,17 +3192,16 @@ ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; 
GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -3193,23 +3213,23 @@ ; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB16_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB16_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 @@ -3221,45 +3241,49 @@ ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; 
%bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064-NEXT: s_mov_b64 
exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB16_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 @@ -3270,7 +3294,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; 
GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3280,38 +3304,39 @@ ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 
bound_ctrl:0 +; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB16_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 @@ -3320,9 +3345,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB16_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3356,16 +3381,16 @@ ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; 
GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3377,24 +3402,24 @@ ; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB17_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB17_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 @@ -3407,16 +3432,16 @@ ; GFX9-LABEL: max_i32_varying: ; 
GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3428,23 +3453,23 @@ ; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB17_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB17_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 @@ -3456,17 
+3481,14 @@ ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf @@ -3474,26 +3496,36 @@ ; GFX1064-NEXT: v_mov_b32_e32 v3, v2 ; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 ; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; 
GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 +; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB17_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 @@ -3514,9 +3546,6 @@ ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 @@ -3524,27 +3553,34 @@ ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_mov_b32_e32 v3, v2 ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: s_mov_b32 
exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 ; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB17_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 @@ -3553,7 +3589,7 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB17_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 @@ -3774,16 +3810,16 @@ ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, v1 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3795,24 +3831,24 @@ ; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB19_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB19_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 @@ -3825,16 +3861,16 @@ ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, 
exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -3846,23 +3882,23 @@ ; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB19_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB19_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 @@ -3874,17 +3910,14 @@ ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf @@ -3892,26 +3925,36 @@ ; GFX1064-NEXT: v_mov_b32_e32 v3, v2 ; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 +; GFX1064-NEXT: v_mov_b32_e32 v3, s4 ; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 +; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 +; GFX1064-NEXT: v_writelane_b32 v1, s5, 
32 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB19_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 @@ -3932,9 +3975,6 @@ ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 @@ -3942,27 +3982,34 @@ ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: v_mov_b32_e32 v3, v2 ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 +; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 ; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB19_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 @@ -3971,7 +4018,7 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB19_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 @@ -4192,17 +4239,16 @@ ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, 
v0 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -4214,24 +4260,24 @@ ; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB21_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB21_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 @@ -4244,17 +4290,16 @@ ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 
s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -4266,23 +4311,23 @@ ; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB21_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB21_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 @@ -4294,45 +4339,49 @@ ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; 
GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB21_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 @@ -4343,7 +4392,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, 
v3 ; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -4353,38 +4402,39 @@ ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s3 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB21_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 @@ -4393,9 +4443,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB21_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -4611,16 +4661,16 @@ ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, 
exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, -1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, -1 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -4632,24 +4682,24 @@ ; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v2, 63 +; GFX8-NEXT: v_readlane_b32 s4, v2, 63 ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_mov_b64 exec, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB23_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB23_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 @@ -4662,16 +4712,16 @@ ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; 
GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, -1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, -1 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -4683,23 +4733,23 @@ ; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v2, 63 +; GFX9-NEXT: v_readlane_b32 s4, v2, 63 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB23_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB23_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 @@ -4711,44 +4761,49 @@ ; ; GFX1064-LABEL: umin_i32_varying: ; GFX1064: ; %bb.0: ; %entry 
+; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v3, -1 +; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 
row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s2 -; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 -; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB23_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s3 +; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 @@ -4759,7 +4814,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; 
GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -4769,37 +4824,39 @@ ; ; GFX1032-LABEL: umin_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 -; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 +; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, -1 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 
row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: ; implicit-def: $vcc_hi +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB23_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s3 +; GFX1032-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 @@ -4808,9 +4865,9 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB23_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -208,17 +208,16 @@ ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[10:11], exec -; GFX8-NEXT: ; implicit-def: $vgpr3 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; 
GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX8-NEXT: s_cbranch_execz BB1_4 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_mov_b64 s[10:11], exec -; GFX8-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[12:13] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s11, v0 +; GFX8-NEXT: s_mov_b64 exec, s[10:11] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec @@ -250,31 +249,30 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: BB1_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_wqm_b64 s[4:5], -1 ; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX8-NEXT: s_cbranch_vccnz BB1_6 ; GFX8-NEXT: ; %bb.5: ; %if -; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: BB1_6: ; %UnifiedReturnBlock ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[10:11], exec -; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX9-NEXT: s_cbranch_execz BB1_4 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_mov_b64 s[10:11], exec -; GFX9-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[12:13] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s11, v0 +; GFX9-NEXT: s_mov_b64 exec, s[10:11] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; 
GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec @@ -306,53 +304,54 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v3, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 ; GFX9-NEXT: BB1_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_wqm_b64 s[4:5], -1 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz BB1_6 ; GFX9-NEXT: ; %bb.5: ; %if -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: BB1_6: ; %UnifiedReturnBlock ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-NEXT: ; implicit-def: $vgpr4 -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX1064-NEXT: s_cbranch_execz BB1_4 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_mov_b64 s[10:11], exec -; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mov_b64 exec, s[12:13] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s10, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s11, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; 
GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s12, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s12 -; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s12, v2, 15 -; GFX1064-NEXT: v_readlane_b32 s13, v2, 31 -; GFX1064-NEXT: v_writelane_b32 v1, s12, 16 -; GFX1064-NEXT: v_readlane_b32 s12, v2, 63 -; GFX1064-NEXT: v_writelane_b32 v1, s13, 32 -; GFX1064-NEXT: v_readlane_b32 s13, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s13, 48 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s12, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s12 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s12, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s13, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s12, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[10:11] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1064-NEXT: v_readlane_b32 s12, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s14, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s13, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[10:11] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, 
exec_hi, v0 +; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1064-NEXT: v_writelane_b32 v3, s14, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[10:11] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 @@ -366,69 +365,69 @@ ; GFX1064-NEXT: s_or_b64 exec, exec, s[30:31] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 -; GFX1064-NEXT: v_add_nc_u32_e32 v4, s4, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s4, v0 ; GFX1064-NEXT: BB1_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 ; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX1064-NEXT: s_cbranch_vccnz BB1_6 ; GFX1064-NEXT: ; %bb.5: ; %if -; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: BB1_6: ; %UnifiedReturnBlock ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr4 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: ; implicit-def: $vcc_hi -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s8, s9 ; GFX1032-NEXT: s_cbranch_execz BB1_4 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_mov_b32 s9, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s10, -1 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mov_b32 exec_lo, s10 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s9, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 
row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s10, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s11, v2, 15 -; GFX1032-NEXT: v_writelane_b32 v1, s11, 16 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s11, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s10, v1, 15 +; GFX1032-NEXT: s_mov_b32 exec_lo, s9 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s9, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s10, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB1_3 ; GFX1032-NEXT: ; %bb.2: -; GFX1032-NEXT: v_mov_b32_e32 v0, s10 +; GFX1032-NEXT: v_mov_b32_e32 v0, s11 +; GFX1032-NEXT: s_mov_b32 s10, s11 ; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX1032-NEXT: BB1_3: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; 
GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_add_nc_u32_e32 v4, s4, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s4, v0 ; GFX1032-NEXT: BB1_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 ; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_vccnz BB1_6 ; GFX1032-NEXT: ; %bb.5: ; %if -; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: BB1_6: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global 
-amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) @@ -56,7 +56,9 @@ ; DPPCOMB: v_add_u32_dpp ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 ; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 -; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]] +; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] ; GFX8MORE: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { entry: @@ -125,7 +127,9 @@ ; DPPCOMB: v_add_u32_dpp ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], 
v{{[0-9]+}}, 31 ; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 -; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]] +; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s +; RUN: 
llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) @@ -56,7 +56,9 @@ ; DPPCOMB: v_add_u32_dpp ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 ; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 -; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]] +; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] ; GFX8MORE: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { entry: @@ -138,7 +140,9 @@ ; DPPCOMB: v_add_u32_dpp ; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 ; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 -; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} 
v[[value:[0-9]+]], s[[scalar_value]] +; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]] +; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -17,12 +17,12 @@ ; GFX10-LABEL: gather4_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -45,12 +45,12 @@ ; GFX10-LABEL: gather4_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -73,12 +73,12 @@ ; GFX10-LABEL: gather4_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; 
GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -101,12 +101,12 @@ ; GFX10-LABEL: gather4_c_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -129,12 +129,12 @@ ; GFX10-LABEL: gather4_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -159,12 +159,12 @@ ; GFX10-LABEL: gather4_c_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ 
-187,12 +187,12 @@ ; GFX10-LABEL: gather4_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -215,12 +215,12 @@ ; GFX10-LABEL: gather4_c_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -245,12 +245,12 @@ ; GFX10-LABEL: gather4_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -276,12 +276,12 @@ ; GFX10-LABEL: gather4_c_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, 
exec_lo, s12 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -15,10 +15,10 @@ ; GFX10-LABEL: sample_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -41,12 +41,12 @@ ; GFX10-LABEL: sample_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -69,12 +69,12 @@ ; GFX10-LABEL: sample_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -97,12 +97,12 @@ ; GFX10-LABEL: sample_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -125,12 +125,12 @@ ; GFX10-LABEL: sample_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -153,12 +153,12 @@ ; GFX10-LABEL: sample_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -179,10 +179,10 @@ ; GFX10-LABEL: sample_c_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: 
image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -205,12 +205,12 @@ ; GFX10-LABEL: sample_c_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -233,12 +233,12 @@ ; GFX10-LABEL: sample_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -261,12 +261,12 @@ ; GFX10-LABEL: sample_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -289,12 +289,12 @@ ; GFX10-LABEL: sample_c_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: 
; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -319,12 +319,12 @@ ; GFX10-LABEL: sample_c_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -345,10 +345,10 @@ ; GFX10-LABEL: sample_b_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -371,12 +371,12 @@ ; GFX10-LABEL: sample_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog 
main_body: @@ -397,10 +397,10 @@ ; GFX10-LABEL: sample_c_b_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -423,12 +423,12 @@ ; GFX10-LABEL: sample_c_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -451,12 +451,12 @@ ; GFX10-LABEL: sample_b_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -481,12 +481,12 @@ ; GFX10-LABEL: sample_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] 
dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -509,12 +509,12 @@ ; GFX10-LABEL: sample_c_b_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -540,12 +540,12 @@ ; GFX10-LABEL: sample_c_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -35,10 +35,10 @@ ; GFX10-LABEL: image_sample_2d_f16: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return 
to shader part epilog main_body: @@ -52,14 +52,14 @@ ; TONGA-NEXT: s_mov_b64 s[14:15], exec ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: v_mov_b32_e32 v2, 0 -; TONGA-NEXT: v_mov_b32_e32 v4, s12 -; TONGA-NEXT: v_mov_b32_e32 v5, s13 ; TONGA-NEXT: v_mov_b32_e32 v3, v2 ; TONGA-NEXT: s_and_b64 exec, exec, s[14:15] ; TONGA-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 +; TONGA-NEXT: v_mov_b32_e32 v0, s12 +; TONGA-NEXT: v_mov_b32_e32 v1, s13 ; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: flat_store_dword v[0:1], v3 ; TONGA-NEXT: v_mov_b32_e32 v0, v2 -; TONGA-NEXT: flat_store_dword v[4:5], v3 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: ; return to shader part epilog ; @@ -68,14 +68,14 @@ ; GFX81-NEXT: s_mov_b64 s[14:15], exec ; GFX81-NEXT: s_wqm_b64 exec, exec ; GFX81-NEXT: v_mov_b32_e32 v2, 0 -; GFX81-NEXT: v_mov_b32_e32 v4, s12 -; GFX81-NEXT: v_mov_b32_e32 v5, s13 ; GFX81-NEXT: v_mov_b32_e32 v3, v2 ; GFX81-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX81-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 +; GFX81-NEXT: v_mov_b32_e32 v0, s12 +; GFX81-NEXT: v_mov_b32_e32 v1, s13 ; GFX81-NEXT: s_waitcnt vmcnt(0) +; GFX81-NEXT: flat_store_dword v[0:1], v3 ; GFX81-NEXT: v_mov_b32_e32 v0, v2 -; GFX81-NEXT: flat_store_dword v[4:5], v3 ; GFX81-NEXT: s_waitcnt vmcnt(0) ; GFX81-NEXT: ; return to shader part epilog ; @@ -84,31 +84,31 @@ ; GFX9-NEXT: s_mov_b64 s[14:15], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 ; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: global_store_dword v[4:5], v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
; return to shader part epilog ; ; GFX10-LABEL: image_sample_2d_f16_tfe: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s14, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10-NEXT: v_mov_b32_e32 v0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s13 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[0:1], v3, off ; GFX10-NEXT: v_mov_b32_e32 v0, v2 -; GFX10-NEXT: global_store_dword v[4:5], v3, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -240,10 +240,10 @@ ; GFX10-LABEL: image_sample_b_2d_v3f16: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -304,7 +304,6 @@ ; GFX10-LABEL: image_sample_b_2d_v3f16_tfe: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -314,6 +313,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -366,10 +366,10 @@ ; 
GFX10-LABEL: image_sample_b_2d_v4f16: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -431,7 +431,6 @@ ; GFX10-LABEL: image_sample_b_2d_v4f16_tfe: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -441,6 +440,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -25,10 +25,10 @@ ; GFX10-LABEL: sample_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -39,18 +39,18 @@ define amdgpu_ps <4 
x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { ; VERDE-LABEL: sample_1d_tfe: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: s_mov_b64 s[16:17], exec +; VERDE-NEXT: s_mov_b64 s[14:15], exec ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: v_mov_b32_e32 v5, v0 ; VERDE-NEXT: v_mov_b32_e32 v0, 0 -; VERDE-NEXT: s_mov_b32 s15, 0xf000 -; VERDE-NEXT: s_mov_b32 s14, -1 ; VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: v_mov_b32_e32 v3, v0 ; VERDE-NEXT: v_mov_b32_e32 v4, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[16:17] +; VERDE-NEXT: s_and_b64 exec, exec, s[14:15] ; VERDE-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe +; VERDE-NEXT: s_mov_b32 s15, 0xf000 +; VERDE-NEXT: s_mov_b32 s14, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -62,36 +62,36 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, s12 -; GFX6789-NEXT: v_mov_b32_e32 v7, s13 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe +; GFX6789-NEXT: v_mov_b32_e32 v5, s12 +; GFX6789-NEXT: v_mov_b32_e32 v6, s13 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[6:7], v4, off +; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: 
[0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s12 ; encoding: [0x0c,0x02,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, s13 ; encoding: [0x0d,0x02,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00] +; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -130,13 +130,13 @@ ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe ; encoding: 
[0x00,0x01,0x81,0xf0,0x02,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -178,13 +178,13 @@ ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x02,0x81,0xf0,0x02,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -226,13 +226,13 @@ ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_3: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x04,0x81,0xf0,0x02,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -274,13 +274,13 @@ ; GFX10-LABEL: 
sample_1d_tfe_adjust_writemask_4: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x08,0x81,0xf0,0x02,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -324,7 +324,6 @@ ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_12: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -332,6 +331,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x03,0x81,0xf0,0x03,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -377,7 +377,6 @@ ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_24: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: 
[0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -385,6 +384,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0a,0x81,0xf0,0x03,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -432,7 +432,6 @@ ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_134: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] @@ -441,6 +440,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0d,0x81,0xf0,0x04,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -461,18 +461,18 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { ; VERDE-LABEL: sample_1d_lwe: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: s_mov_b64 s[16:17], exec +; VERDE-NEXT: s_mov_b64 s[14:15], exec ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: v_mov_b32_e32 v5, v0 ; VERDE-NEXT: v_mov_b32_e32 v0, 0 -; VERDE-NEXT: s_mov_b32 s15, 0xf000 -; VERDE-NEXT: s_mov_b32 s14, -1 ; 
VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: v_mov_b32_e32 v3, v0 ; VERDE-NEXT: v_mov_b32_e32 v4, v0 -; VERDE-NEXT: s_and_b64 exec, exec, s[16:17] +; VERDE-NEXT: s_and_b64 exec, exec, s[14:15] ; VERDE-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe +; VERDE-NEXT: s_mov_b32 s15, 0xf000 +; VERDE-NEXT: s_mov_b32 s14, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -484,36 +484,36 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, s12 -; GFX6789-NEXT: v_mov_b32_e32 v7, s13 ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe +; GFX6789-NEXT: v_mov_b32_e32 v5, s12 +; GFX6789-NEXT: v_mov_b32_e32 v6, s13 ; GFX6789-NEXT: s_waitcnt vmcnt(0) -; GFX6789-NEXT: global_store_dword v[6:7], v4, off +; GFX6789-NEXT: global_store_dword v[5:6], v4, off ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_lwe: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s14, exec_lo ; encoding: [0x7e,0x03,0x8e,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s12 ; encoding: [0x0c,0x02,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, s13 ; encoding: [0x0d,0x02,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: 
[0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; encoding: [0x7e,0x0e,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00] +; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -546,10 +546,10 @@ ; GFX10-LABEL: sample_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -579,10 +579,10 @@ ; GFX10-LABEL: sample_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: 
[0x10,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -612,10 +612,10 @@ ; GFX10-LABEL: sample_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; encoding: [0x18,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -645,10 +645,10 @@ ; GFX10-LABEL: sample_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x20,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -678,10 +678,10 @@ ; GFX10-LABEL: sample_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: 
[0x28,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -711,10 +711,10 @@ ; GFX10-LABEL: sample_c_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa0,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -744,10 +744,10 @@ ; GFX10-LABEL: sample_c_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa0,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -777,10 +777,10 @@ ; GFX10-LABEL: sample_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x84,0xf0,0x00,0x00,0x40,0x00] 
+; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -810,10 +810,10 @@ ; GFX10-LABEL: sample_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x84,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -843,10 +843,10 @@ ; GFX10-LABEL: sample_c_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa4,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -876,10 +876,10 @@ ; GFX10-LABEL: sample_c_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa4,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; 
implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -909,10 +909,10 @@ ; GFX10-LABEL: sample_b_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x94,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -942,10 +942,10 @@ ; GFX10-LABEL: sample_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x94,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -975,10 +975,10 @@ ; GFX10-LABEL: sample_c_b_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb4,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1008,10 +1008,10 @@ ; GFX10-LABEL: sample_c_b_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb4,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1041,10 +1041,10 @@ ; GFX10-LABEL: sample_b_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x98,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1074,10 +1074,10 @@ ; GFX10-LABEL: sample_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x98,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1107,10 +1107,10 @@ ; GFX10-LABEL: sample_c_b_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb8,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1140,10 +1140,10 @@ ; GFX10-LABEL: sample_c_b_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb8,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1896,10 +1896,10 @@ ; GFX10-LABEL: sample_1d_unorm: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: 
[0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1929,10 +1929,10 @@ ; GFX10-LABEL: sample_1d_glc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; encoding: [0x00,0x2f,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1962,10 +1962,10 @@ ; GFX10-LABEL: sample_1d_slc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; encoding: [0x00,0x0f,0x80,0xf2,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1995,10 +1995,10 @@ ; GFX10-LABEL: sample_1d_glc_slc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc slc ; encoding: [0x00,0x2f,0x80,0xf2,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; 
GFX10-NEXT: ; return to shader part epilog main_body: @@ -2028,10 +2028,10 @@ ; GFX10-LABEL: adjust_writemask_sample_0: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x01,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2062,10 +2062,10 @@ ; GFX10-LABEL: adjust_writemask_sample_01: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x03,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2096,10 +2096,10 @@ ; GFX10-LABEL: adjust_writemask_sample_012: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x07,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; 
return to shader part epilog main_body: @@ -2130,10 +2130,10 @@ ; GFX10-LABEL: adjust_writemask_sample_12: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x06,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2164,10 +2164,10 @@ ; GFX10-LABEL: adjust_writemask_sample_03: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x09,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2198,10 +2198,10 @@ ; GFX10-LABEL: adjust_writemask_sample_13: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to 
shader part epilog main_body: @@ -2232,10 +2232,10 @@ ; GFX10-LABEL: adjust_writemask_sample_123: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0e,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2284,10 +2284,10 @@ ; GFX10-LABEL: adjust_writemask_sample_123_to_12: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x06,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -2318,10 +2318,10 @@ ; GFX10-LABEL: adjust_writemask_sample_013_to_13: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; encoding: [0x7e,0x03,0x8c,0xbe] -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x40,0x00] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; 
return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -19,8 +19,10 @@ ; CHECK-LABEL: {{^}}test2: ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec +; Following copy should go away: +; CHECK: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[LIVE]] ; CHECK-DAG: s_wqm_b64 exec, exec -; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]] +; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[COPY]] ; CHECK: image_sample v0, [[VAR]], define amdgpu_ps float @test2() #0 { %live = call i1 @llvm.amdgcn.ps.live() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -61,10 +61,10 @@ ;CHECK: s_wqm_b64 exec, exec ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +;CHECK: v_add_f32_e32 ;CHECK: s_and_b64 exec, exec, [[ORIG]] ;CHECK: buffer_store_dword -;CHECK; s_wqm_b64 exec, exec -;CHECK: v_add_f32_e32 define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -80,12 +80,14 @@ ; Make sure the transition from Exact to WWM then softwqm does not trigger WQM. 
; ;CHECK-LABEL: {{^}}test_wwm1: +;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1 ;CHECK: buffer_load_dword +;CHECK: s_mov_b64 exec, [[ORIG0]] ;CHECK: buffer_store_dword -;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 +;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1 ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK: s_mov_b64 exec, [[ORIG1]] ;CHECK-NOT: s_wqm_b64 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -93,11 +93,10 @@ ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec ;CHECK-NEXT: s_wqm_b64 exec, exec ;CHECK: v_mul_lo_u32 [[MUL:v[0-9]+]], v0, v1 -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK: store -;CHECK: s_wqm_b64 exec, exec ;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[ORIG]] ;CHECK: image_sample +;CHECK: store define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) { main_body: %c.1 = mul i32 %c, %d @@ -578,10 +577,10 @@ ;CHECK: buffer_store_dword ;CHECK: s_wqm_b64 exec, exec ;CHECK: v_cmpx_ -;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]] -;CHECK: buffer_store_dword -;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: image_sample +;CHECK: buffer_store_dword define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) { main_body: %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- 
a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -10,16 +10,18 @@ %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0) -; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] -; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] +; GFX9: s_or_saveexec_b64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, -1 + +; GFX9-DAG: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DAG: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] +; GFX9-DAG: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) %tmp121 = add i32 %tmp105, %tmp120 %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) -; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] -; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] +; GFX9-DAG: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DAG: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] +; GFX9-DAG: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) %tmp136 = add i32 %tmp107, %tmp135 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) @@ -100,7 +102,6 @@ ; GFX9-O3: v_mov_b32_e32 v2, [[ARG]] - ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0